1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
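  // For example, -sve-tail-folding=all+noreverse sets InitialBits to
  // TailFoldingOpts::All and adds TailFoldingOpts::Reverse to DisableBits,
  // with NeedsDefault false, so getBits() returns All with the Reverse flag
  // cleared regardless of the CPU's default.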
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
325
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
385
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
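/// For example, 0x12345678 expands to a MOVZ plus one MOVK (cost 2), while a
/// 64-bit value with four distinct non-zero 16-bit chunks typically expands
/// to a MOVZ plus three MOVKs (cost 4).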
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
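  // For example, a 128-bit constant is costed as the sum of the costs of its
  // low and high 64-bit halves, subject to the minimum overall cost of 1
  // applied below.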
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialize the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
443 const APInt &Imm, Type *Ty,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
508}
509
512 const APInt &Imm, Type *Ty,
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
560}
561
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
574}
575
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
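    // For example, <vscale x 8 x i32> buckets of i32 elements give
    // NaturalVectorWidth = 128 / 32 = 4, so two HISTCNT instructions are
    // needed and the cost is 2 * BaseHistCntCost.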
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
616}
617
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::sadd_sat:
655 case Intrinsic::ssub_sat:
656 case Intrinsic::uadd_sat:
657 case Intrinsic::usub_sat: {
658 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
659 MVT::v8i16, MVT::v2i32, MVT::v4i32,
660 MVT::v2i64};
661 auto LT = getTypeLegalizationCost(RetTy);
662 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
663 // need to extend the type, as it uses shr(qadd(shl, shl)).
664 unsigned Instrs =
665 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
666 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
667 return LT.first * Instrs;
668
670 uint64_t VectorSize = TS.getKnownMinValue();
671
672 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
673 return LT.first * Instrs;
674
675 break;
676 }
677 case Intrinsic::abs: {
678 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
679 MVT::v8i16, MVT::v2i32, MVT::v4i32,
680 MVT::v2i64};
681 auto LT = getTypeLegalizationCost(RetTy);
682 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
683 return LT.first;
684 break;
685 }
686 case Intrinsic::bswap: {
687 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
688 MVT::v4i32, MVT::v2i64};
689 auto LT = getTypeLegalizationCost(RetTy);
690 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
691 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
692 return LT.first;
693 break;
694 }
695 case Intrinsic::fma:
696 case Intrinsic::fmuladd: {
697 // Given an fma or fmuladd, cost it the same as an fmul instruction, as the
698 // costs are usually the same. TODO: Add fp16 and bf16 expansion costs.
699 Type *EltTy = RetTy->getScalarType();
700 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
701 (EltTy->isHalfTy() && ST->hasFullFP16()))
702 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
703 break;
704 }
705 case Intrinsic::stepvector: {
706 InstructionCost Cost = 1; // Cost of the `index' instruction
707 auto LT = getTypeLegalizationCost(RetTy);
708 // Legalisation of illegal vectors involves an `index' instruction plus
709 // (LT.first - 1) vector adds.
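    // For example, <vscale x 8 x i64> legalises to four <vscale x 2 x i64>
    // parts (LT.first == 4), giving one `index' plus three vector adds.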
710 if (LT.first > 1) {
711 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
712 InstructionCost AddCost =
713 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
714 Cost += AddCost * (LT.first - 1);
715 }
716 return Cost;
717 }
718 case Intrinsic::vector_extract:
719 case Intrinsic::vector_insert: {
720 // If both the vector and subvector types are legal types and the index
721 // is 0, then this should be a no-op or simple operation; return a
722 // relatively low cost.
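    // For example, extracting a legal <2 x i64> from index 0 of a legal
    // <vscale x 2 x i64> is treated as free below.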
723
724 // If arguments aren't actually supplied, then we cannot determine the
725 // value of the index. We also want to skip predicate types.
726 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
728 break;
729
730 LLVMContext &C = RetTy->getContext();
731 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
732 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
733 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
734 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
735 // Skip this if either the vector or subvector types are unpacked
736 // SVE types; they may get lowered to stack stores and loads.
737 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
738 break;
739
741 getTLI()->getTypeConversion(C, SubVecVT);
743 getTLI()->getTypeConversion(C, VecVT);
744 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
745 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
746 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
747 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
748 return TTI::TCC_Free;
749 break;
750 }
751 case Intrinsic::bitreverse: {
752 static const CostTblEntry BitreverseTbl[] = {
753 {Intrinsic::bitreverse, MVT::i32, 1},
754 {Intrinsic::bitreverse, MVT::i64, 1},
755 {Intrinsic::bitreverse, MVT::v8i8, 1},
756 {Intrinsic::bitreverse, MVT::v16i8, 1},
757 {Intrinsic::bitreverse, MVT::v4i16, 2},
758 {Intrinsic::bitreverse, MVT::v8i16, 2},
759 {Intrinsic::bitreverse, MVT::v2i32, 2},
760 {Intrinsic::bitreverse, MVT::v4i32, 2},
761 {Intrinsic::bitreverse, MVT::v1i64, 2},
762 {Intrinsic::bitreverse, MVT::v2i64, 2},
763 };
764 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
765 const auto *Entry =
766 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
767 if (Entry) {
768 // The cost model uses the legal type (i32) that i8 and i16 will be
769 // converted to, plus 1 so that we match the actual lowering cost.
770 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
771 TLI->getValueType(DL, RetTy, true) == MVT::i16)
772 return LegalisationCost.first * Entry->Cost + 1;
773
774 return LegalisationCost.first * Entry->Cost;
775 }
776 break;
777 }
778 case Intrinsic::ctpop: {
779 if (!ST->hasNEON()) {
780 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
781 return getTypeLegalizationCost(RetTy).first * 12;
782 }
783 static const CostTblEntry CtpopCostTbl[] = {
784 {ISD::CTPOP, MVT::v2i64, 4},
785 {ISD::CTPOP, MVT::v4i32, 3},
786 {ISD::CTPOP, MVT::v8i16, 2},
787 {ISD::CTPOP, MVT::v16i8, 1},
788 {ISD::CTPOP, MVT::i64, 4},
789 {ISD::CTPOP, MVT::v2i32, 3},
790 {ISD::CTPOP, MVT::v4i16, 2},
791 {ISD::CTPOP, MVT::v8i8, 1},
792 {ISD::CTPOP, MVT::i32, 5},
793 };
794 auto LT = getTypeLegalizationCost(RetTy);
795 MVT MTy = LT.second;
796 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
797 // Extra cost of +1 when illegal vector types are legalized by promoting
798 // the integer type.
799 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
800 RetTy->getScalarSizeInBits()
801 ? 1
802 : 0;
803 return LT.first * Entry->Cost + ExtraCost;
804 }
805 break;
806 }
807 case Intrinsic::sadd_with_overflow:
808 case Intrinsic::uadd_with_overflow:
809 case Intrinsic::ssub_with_overflow:
810 case Intrinsic::usub_with_overflow:
811 case Intrinsic::smul_with_overflow:
812 case Intrinsic::umul_with_overflow: {
813 static const CostTblEntry WithOverflowCostTbl[] = {
814 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
815 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
816 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
817 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
818 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
819 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
820 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
821 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
822 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
823 {Intrinsic::usub_with_overflow, MVT::i8, 3},
824 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
825 {Intrinsic::usub_with_overflow, MVT::i16, 3},
826 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
827 {Intrinsic::usub_with_overflow, MVT::i32, 1},
828 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
829 {Intrinsic::usub_with_overflow, MVT::i64, 1},
830 {Intrinsic::smul_with_overflow, MVT::i8, 5},
831 {Intrinsic::umul_with_overflow, MVT::i8, 4},
832 {Intrinsic::smul_with_overflow, MVT::i16, 5},
833 {Intrinsic::umul_with_overflow, MVT::i16, 4},
834 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
835 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
836 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
837 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
838 };
839 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
840 if (MTy.isSimple())
841 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
842 MTy.getSimpleVT()))
843 return Entry->Cost;
844 break;
845 }
846 case Intrinsic::fptosi_sat:
847 case Intrinsic::fptoui_sat: {
848 if (ICA.getArgTypes().empty())
849 break;
850 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
851 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
852 EVT MTy = TLI->getValueType(DL, RetTy);
853 // Check for the legal types, which are where the size of the input and the
854 // output are the same, or we are using cvt f64->i32 or f32->i64.
855 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
856 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
857 LT.second == MVT::v2f64)) {
858 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
859 (LT.second == MVT::f64 && MTy == MVT::i32) ||
860 (LT.second == MVT::f32 && MTy == MVT::i64)))
861 return LT.first;
862 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
863 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
864 MTy.getScalarSizeInBits() == 64)
865 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
866 }
867 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
868 // f32.
869 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
870 return LT.first + getIntrinsicInstrCost(
871 {ICA.getID(),
872 RetTy,
873 {ICA.getArgTypes()[0]->getWithNewType(
874 Type::getFloatTy(RetTy->getContext()))}},
875 CostKind);
876 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
877 (LT.second == MVT::f16 && MTy == MVT::i64) ||
878 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
879 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
880 return LT.first;
881 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
882 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
883 MTy.getScalarSizeInBits() == 32)
884 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
885 // Extending vector types v8f16->v8i64. These currently scalarize but the
886 // codegen could be better.
887 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
888 MTy.getScalarSizeInBits() == 64)
889 return MTy.getVectorNumElements() * 3;
890
891 // If we can, use a legal convert followed by a min+max.
892 if ((LT.second.getScalarType() == MVT::f32 ||
893 LT.second.getScalarType() == MVT::f64 ||
894 LT.second.getScalarType() == MVT::f16) &&
895 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
896 Type *LegalTy =
897 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
898 if (LT.second.isVector())
899 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
901 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
902 LegalTy, {LegalTy, LegalTy});
904 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
905 LegalTy, {LegalTy, LegalTy});
907 return LT.first * Cost +
908 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
909 : 1);
910 }
911 // Otherwise we need to follow the default expansion that clamps the value
912 // using a float min/max with a fcmp+sel for nan handling when signed.
913 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
914 RetTy = RetTy->getScalarType();
915 if (LT.second.isVector()) {
916 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
917 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
918 }
919 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
921 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
923 Cost +=
924 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
926 if (IsSigned) {
927 Type *CondTy = RetTy->getWithNewBitWidth(1);
928 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
930 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
932 }
933 return LT.first * Cost;
934 }
935 case Intrinsic::fshl:
936 case Intrinsic::fshr: {
937 if (ICA.getArgs().empty())
938 break;
939
940 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
941
942 // ROTR / ROTL is a funnel shift with equal first and second operand. For
943 // ROTR on integer registers (i32/i64) this can be done in a single ror
944 // instruction. A fshl with a non-constant shift uses a neg + ror.
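    // For example, fshl(i32 %x, i32 %x, i32 7) lowers to a single ror
    // (cost 1), whereas the same rotate with a variable amount needs an
    // extra neg (cost 2).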
945 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
946 (RetTy->getPrimitiveSizeInBits() == 32 ||
947 RetTy->getPrimitiveSizeInBits() == 64)) {
948 InstructionCost NegCost =
949 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
950 return 1 + NegCost;
951 }
952
953 // TODO: Add handling for fshl where third argument is not a constant.
954 if (!OpInfoZ.isConstant())
955 break;
956
957 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
958 if (OpInfoZ.isUniform()) {
959 static const CostTblEntry FshlTbl[] = {
960 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
961 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
962 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
963 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
964 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
965 // to avoid having to duplicate the costs.
966 const auto *Entry =
967 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
968 if (Entry)
969 return LegalisationCost.first * Entry->Cost;
970 }
971
972 auto TyL = getTypeLegalizationCost(RetTy);
973 if (!RetTy->isIntegerTy())
974 break;
975
976 // Estimate cost manually, as types like i8 and i16 will get promoted to
977 // i32 and CostTableLookup will ignore the extra conversion cost.
978 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
979 RetTy->getScalarSizeInBits() < 64) ||
980 (RetTy->getScalarSizeInBits() % 64 != 0);
981 unsigned ExtraCost = HigherCost ? 1 : 0;
982 if (RetTy->getScalarSizeInBits() == 32 ||
983 RetTy->getScalarSizeInBits() == 64)
984 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
985 // extr instruction.
986 else if (HigherCost)
987 ExtraCost = 1;
988 else
989 break;
990 return TyL.first + ExtraCost;
991 }
992 case Intrinsic::get_active_lane_mask: {
993 auto RetTy = cast<VectorType>(ICA.getReturnType());
994 EVT RetVT = getTLI()->getValueType(DL, RetTy);
995 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
996 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
997 break;
998
999 if (RetTy->isScalableTy()) {
1000 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1002 break;
1003
1004 auto LT = getTypeLegalizationCost(RetTy);
1005 InstructionCost Cost = LT.first;
1006 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1007 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1008 // nxv32i1 = get_active_lane_mask(base, idx) ->
1009 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1010 if (ST->hasSVE2p1() || ST->hasSME2()) {
1011 Cost /= 2;
1012 if (Cost == 1)
1013 return Cost;
1014 }
1015
1016 // If more than one whilelo intrinsic is required, include the extra cost
1017 // required by the saturating add & select required to increment the
1018 // start value after the first intrinsic call.
1019 Type *OpTy = ICA.getArgTypes()[0];
1020 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1021 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1022 Type *CondTy = OpTy->getWithNewBitWidth(1);
1023 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1025 return Cost + (SplitCost * (Cost - 1));
1026 } else if (!getTLI()->isTypeLegal(RetVT)) {
1027 // We don't have enough context at this point to determine if the mask
1028 // is going to be kept live after the block, which will force the vXi1
1029 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1030 // For now, we just assume the vectorizer created this intrinsic and
1031 // the result will be the input for a PHI. In this case the cost will
1032 // be extremely high for fixed-width vectors.
1033 // NOTE: getScalarizationOverhead returns a cost that's far too
1034 // pessimistic for the actual generated codegen. In reality there are
1035 // two instructions generated per lane.
1036 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1037 }
1038 break;
1039 }
1040 case Intrinsic::experimental_vector_match: {
1041 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1042 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1043 unsigned SearchSize = NeedleTy->getNumElements();
1044 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1045 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1046 // Neoverse V3, these are cheap operations with the same latency as a
1047 // vector ADD. In most cases, however, we also need to do an extra DUP.
1048 // For fixed-length vectors we currently need an extra five to six
1049 // instructions besides the MATCH.
1051 if (isa<FixedVectorType>(RetTy))
1052 Cost += 10;
1053 return Cost;
1054 }
1055 break;
1056 }
1057 case Intrinsic::experimental_cttz_elts: {
1058 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1059 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1060 // This will consist of a SVE brkb and a cntp instruction. These
1061 // typically have the same latency and half the throughput as a vector
1062 // add instruction.
1063 return 4;
1064 }
1065 break;
1066 }
1067 case Intrinsic::loop_dependence_raw_mask:
1068 case Intrinsic::loop_dependence_war_mask: {
1069 // The whilewr/rw instructions require SVE2 or SME.
1070 if (ST->hasSVE2() || ST->hasSME()) {
1071 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1072 unsigned EltSizeInBytes =
1073 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1074 if (is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) &&
1075 VecVT.getVectorMinNumElements() == (16 / EltSizeInBytes))
1076 return 1;
1077 }
1078 break;
1079 }
1080 case Intrinsic::experimental_vector_extract_last_active:
1081 if (ST->isSVEorStreamingSVEAvailable()) {
1082 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1083 // This should turn into chained clastb instructions.
1084 return LegalCost;
1085 }
1086 break;
1087 default:
1088 break;
1089 }
1091}
1092
1093/// This function removes redundant reinterpret casts in the presence of
1094/// control flow.
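/// For example (illustrative IR), given
///   %phi = phi <vscale x 16 x i1> [ %a.sb, %bb0 ], [ %b.sb, %bb1 ]
///   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
/// where %a.sb and %b.sb are convert.to.svbool casts of <vscale x 4 x i1>
/// values, the phi is rebuilt over the original <vscale x 4 x i1> values and
/// both conversions become dead.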
1095static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1096 IntrinsicInst &II) {
1097 SmallVector<Instruction *, 32> Worklist;
1098 auto RequiredType = II.getType();
1099
1100 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1101 assert(PN && "Expected Phi Node!");
1102
1103 // Don't create a new Phi unless we can remove the old one.
1104 if (!PN->hasOneUse())
1105 return std::nullopt;
1106
1107 for (Value *IncValPhi : PN->incoming_values()) {
1108 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1109 if (!Reinterpret ||
1110 Reinterpret->getIntrinsicID() !=
1111 Intrinsic::aarch64_sve_convert_to_svbool ||
1112 RequiredType != Reinterpret->getArgOperand(0)->getType())
1113 return std::nullopt;
1114 }
1115
1116 // Create the new Phi
1117 IC.Builder.SetInsertPoint(PN);
1118 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1119 Worklist.push_back(PN);
1120
1121 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1122 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1123 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1124 Worklist.push_back(Reinterpret);
1125 }
1126
1127 // Cleanup Phi Node and reinterprets
1128 return IC.replaceInstUsesWith(II, NPN);
1129}
1130
1131// A collection of properties common to SVE intrinsics that allow for combines
1132// to be written without needing to know the specific intrinsic.
1134 //
1135 // Helper routines for common intrinsic definitions.
1136 //
1137
1138 // e.g. llvm.aarch64.sve.add pg, op1, op2
1139 // with IID ==> llvm.aarch64.sve.add_u
1140 static SVEIntrinsicInfo
1147
1148 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1155
1156 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1162
1163 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1169
1170 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1171 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1172 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1173 return SVEIntrinsicInfo()
1176 }
1177
1178 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1179 // llvm.aarch64.sve.ld1 pg, ptr
1186
1187 // All properties relate to predication and thus having a governing predicate
1188 // is the minimum requirement to say there is intrinsic info to act on.
1189 explicit operator bool() const { return hasGoverningPredicate(); }
1190
1191 //
1192 // Properties relating to the governing predicate.
1193 //
1194
1196 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1197 }
1198
1200 assert(hasGoverningPredicate() && "Property not set!");
1201 return GoverningPredicateIdx;
1202 }
1203
1205 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1206 GoverningPredicateIdx = Index;
1207 return *this;
1208 }
1209
1210 //
1211 // Properties relating to operations the intrinsic could be transformed into.
1212 // NOTE: This does not mean such a transformation is always possible, but the
1213 // knowledge makes it possible to reuse existing optimisations without needing
1214 // to embed specific handling for each intrinsic. For example, instruction
1215 // simplification can be used to optimise an intrinsic's active lanes.
1216 //
1217
1219 return UndefIntrinsic != Intrinsic::not_intrinsic;
1220 }
1221
1223 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1224 return UndefIntrinsic;
1225 }
1226
1228 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1229 UndefIntrinsic = IID;
1230 return *this;
1231 }
1232
1233 bool hasMatchingIROpode() const { return IROpcode != 0; }
1234
1235 unsigned getMatchingIROpode() const {
1236 assert(hasMatchingIROpode() && "Property not set!");
1237 return IROpcode;
1238 }
1239
1241 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1242 IROpcode = Opcode;
1243 return *this;
1244 }
1245
1246 //
1247 // Properties relating to the result of inactive lanes.
1248 //
1249
1251 return ResultLanes == InactiveLanesTakenFromOperand;
1252 }
1253
1255 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1256 return OperandIdxForInactiveLanes;
1257 }
1258
1260 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1261 ResultLanes = InactiveLanesTakenFromOperand;
1262 OperandIdxForInactiveLanes = Index;
1263 return *this;
1264 }
1265
1267 return ResultLanes == InactiveLanesAreNotDefined;
1268 }
1269
1271 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1272 ResultLanes = InactiveLanesAreNotDefined;
1273 return *this;
1274 }
1275
1277 return ResultLanes == InactiveLanesAreUnused;
1278 }
1279
1281 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1282 ResultLanes = InactiveLanesAreUnused;
1283 return *this;
1284 }
1285
1286 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1287 // inactiveLanesAreZeroed =
1288 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1289 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1290
1292 ResultIsZeroInitialized = true;
1293 return *this;
1294 }
1295
1296 //
1297 // The first operand of unary merging operations is typically only used to
1298 // set the result for inactive lanes. Knowing this allows us to deadcode the
1299 // operand when we can prove there are no inactive lanes.
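  // For example, in llvm.aarch64.sve.neg(inactive, pg, op) the first operand
  // only supplies the inactive lanes of the result, so when pg is known to be
  // all active that operand no longer matters and can be dead-coded.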
1300 //
1301
1303 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1304 }
1305
1307 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1308 return OperandIdxWithNoActiveLanes;
1309 }
1310
1312 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1313 OperandIdxWithNoActiveLanes = Index;
1314 return *this;
1315 }
1316
1317private:
1318 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1319
1320 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1321 unsigned IROpcode = 0;
1322
1323 enum PredicationStyle {
1325 InactiveLanesTakenFromOperand,
1326 InactiveLanesAreNotDefined,
1327 InactiveLanesAreUnused
1328 } ResultLanes = Uninitialized;
1329
1330 bool ResultIsZeroInitialized = false;
1331 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1332 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1333};
1334
1336 // Some SVE intrinsics do not use scalable vector types, but since they are
1337 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1338 if (!isa<ScalableVectorType>(II.getType()) &&
1339 all_of(II.args(), [&](const Value *V) {
1340 return !isa<ScalableVectorType>(V->getType());
1341 }))
1342 return SVEIntrinsicInfo();
1343
1344 Intrinsic::ID IID = II.getIntrinsicID();
1345 switch (IID) {
1346 default:
1347 break;
1348 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1349 case Intrinsic::aarch64_sve_fcvt_f16f32:
1350 case Intrinsic::aarch64_sve_fcvt_f16f64:
1351 case Intrinsic::aarch64_sve_fcvt_f32f16:
1352 case Intrinsic::aarch64_sve_fcvt_f32f64:
1353 case Intrinsic::aarch64_sve_fcvt_f64f16:
1354 case Intrinsic::aarch64_sve_fcvt_f64f32:
1355 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1356 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1357 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1358 case Intrinsic::aarch64_sve_fcvtzs:
1359 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1360 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1361 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1362 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1363 case Intrinsic::aarch64_sve_fcvtzu:
1364 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1365 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1366 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1367 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1368 case Intrinsic::aarch64_sve_scvtf:
1369 case Intrinsic::aarch64_sve_scvtf_f16i32:
1370 case Intrinsic::aarch64_sve_scvtf_f16i64:
1371 case Intrinsic::aarch64_sve_scvtf_f32i64:
1372 case Intrinsic::aarch64_sve_scvtf_f64i32:
1373 case Intrinsic::aarch64_sve_ucvtf:
1374 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1375 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1376 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1377 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1379
1380 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1381 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1382 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1383 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1385
1386 case Intrinsic::aarch64_sve_fabd:
1387 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1388 case Intrinsic::aarch64_sve_fadd:
1389 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1390 .setMatchingIROpcode(Instruction::FAdd);
1391 case Intrinsic::aarch64_sve_fdiv:
1392 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1393 .setMatchingIROpcode(Instruction::FDiv);
1394 case Intrinsic::aarch64_sve_fmax:
1395 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1396 case Intrinsic::aarch64_sve_fmaxnm:
1397 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1398 case Intrinsic::aarch64_sve_fmin:
1399 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1400 case Intrinsic::aarch64_sve_fminnm:
1401 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1402 case Intrinsic::aarch64_sve_fmla:
1403 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1404 case Intrinsic::aarch64_sve_fmls:
1405 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1406 case Intrinsic::aarch64_sve_fmul:
1407 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1408 .setMatchingIROpcode(Instruction::FMul);
1409 case Intrinsic::aarch64_sve_fmulx:
1410 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1411 case Intrinsic::aarch64_sve_fnmla:
1412 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1413 case Intrinsic::aarch64_sve_fnmls:
1414 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1415 case Intrinsic::aarch64_sve_fsub:
1416 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1417 .setMatchingIROpcode(Instruction::FSub);
1418 case Intrinsic::aarch64_sve_add:
1419 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1420 .setMatchingIROpcode(Instruction::Add);
1421 case Intrinsic::aarch64_sve_mla:
1422 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1423 case Intrinsic::aarch64_sve_mls:
1424 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1425 case Intrinsic::aarch64_sve_mul:
1426 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1427 .setMatchingIROpcode(Instruction::Mul);
1428 case Intrinsic::aarch64_sve_sabd:
1429 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1430 case Intrinsic::aarch64_sve_sdiv:
1431 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1432 .setMatchingIROpcode(Instruction::SDiv);
1433 case Intrinsic::aarch64_sve_smax:
1434 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1435 case Intrinsic::aarch64_sve_smin:
1436 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1437 case Intrinsic::aarch64_sve_smulh:
1438 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1439 case Intrinsic::aarch64_sve_sub:
1440 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1441 .setMatchingIROpcode(Instruction::Sub);
1442 case Intrinsic::aarch64_sve_uabd:
1443 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1444 case Intrinsic::aarch64_sve_udiv:
1445 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1446 .setMatchingIROpcode(Instruction::UDiv);
1447 case Intrinsic::aarch64_sve_umax:
1448 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1449 case Intrinsic::aarch64_sve_umin:
1450 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1451 case Intrinsic::aarch64_sve_umulh:
1452 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1453 case Intrinsic::aarch64_sve_asr:
1454 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1455 .setMatchingIROpcode(Instruction::AShr);
1456 case Intrinsic::aarch64_sve_lsl:
1457 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1458 .setMatchingIROpcode(Instruction::Shl);
1459 case Intrinsic::aarch64_sve_lsr:
1460 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1461 .setMatchingIROpcode(Instruction::LShr);
1462 case Intrinsic::aarch64_sve_and:
1463 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1464 .setMatchingIROpcode(Instruction::And);
1465 case Intrinsic::aarch64_sve_bic:
1466 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1467 case Intrinsic::aarch64_sve_eor:
1468 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1469 .setMatchingIROpcode(Instruction::Xor);
1470 case Intrinsic::aarch64_sve_orr:
1471 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1472 .setMatchingIROpcode(Instruction::Or);
1473 case Intrinsic::aarch64_sve_shsub:
1474 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1475 case Intrinsic::aarch64_sve_shsubr:
1477 case Intrinsic::aarch64_sve_sqrshl:
1478 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1479 case Intrinsic::aarch64_sve_sqshl:
1480 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1481 case Intrinsic::aarch64_sve_sqsub:
1482 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1483 case Intrinsic::aarch64_sve_srshl:
1484 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1485 case Intrinsic::aarch64_sve_uhsub:
1486 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1487 case Intrinsic::aarch64_sve_uhsubr:
1489 case Intrinsic::aarch64_sve_uqrshl:
1490 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1491 case Intrinsic::aarch64_sve_uqshl:
1492 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1493 case Intrinsic::aarch64_sve_uqsub:
1494 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1495 case Intrinsic::aarch64_sve_urshl:
1496 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1497
1498 case Intrinsic::aarch64_sve_add_u:
1500 Instruction::Add);
1501 case Intrinsic::aarch64_sve_and_u:
1503 Instruction::And);
1504 case Intrinsic::aarch64_sve_asr_u:
1506 Instruction::AShr);
1507 case Intrinsic::aarch64_sve_eor_u:
1509 Instruction::Xor);
1510 case Intrinsic::aarch64_sve_fadd_u:
1512 Instruction::FAdd);
1513 case Intrinsic::aarch64_sve_fdiv_u:
1515 Instruction::FDiv);
1516 case Intrinsic::aarch64_sve_fmul_u:
1518 Instruction::FMul);
1519 case Intrinsic::aarch64_sve_fsub_u:
1521 Instruction::FSub);
1522 case Intrinsic::aarch64_sve_lsl_u:
1524 Instruction::Shl);
1525 case Intrinsic::aarch64_sve_lsr_u:
1527 Instruction::LShr);
1528 case Intrinsic::aarch64_sve_mul_u:
1530 Instruction::Mul);
1531 case Intrinsic::aarch64_sve_orr_u:
1533 Instruction::Or);
1534 case Intrinsic::aarch64_sve_sdiv_u:
1536 Instruction::SDiv);
1537 case Intrinsic::aarch64_sve_sub_u:
1539 Instruction::Sub);
1540 case Intrinsic::aarch64_sve_udiv_u:
1542 Instruction::UDiv);
1543
1544 case Intrinsic::aarch64_sve_addqv:
1545 case Intrinsic::aarch64_sve_and_z:
1546 case Intrinsic::aarch64_sve_bic_z:
1547 case Intrinsic::aarch64_sve_brka_z:
1548 case Intrinsic::aarch64_sve_brkb_z:
1549 case Intrinsic::aarch64_sve_brkn_z:
1550 case Intrinsic::aarch64_sve_brkpa_z:
1551 case Intrinsic::aarch64_sve_brkpb_z:
1552 case Intrinsic::aarch64_sve_cntp:
1553 case Intrinsic::aarch64_sve_compact:
1554 case Intrinsic::aarch64_sve_eor_z:
1555 case Intrinsic::aarch64_sve_eorv:
1556 case Intrinsic::aarch64_sve_eorqv:
1557 case Intrinsic::aarch64_sve_nand_z:
1558 case Intrinsic::aarch64_sve_nor_z:
1559 case Intrinsic::aarch64_sve_orn_z:
1560 case Intrinsic::aarch64_sve_orr_z:
1561 case Intrinsic::aarch64_sve_orv:
1562 case Intrinsic::aarch64_sve_orqv:
1563 case Intrinsic::aarch64_sve_pnext:
1564 case Intrinsic::aarch64_sve_rdffr_z:
1565 case Intrinsic::aarch64_sve_saddv:
1566 case Intrinsic::aarch64_sve_uaddv:
1567 case Intrinsic::aarch64_sve_umaxv:
1568 case Intrinsic::aarch64_sve_umaxqv:
1569 case Intrinsic::aarch64_sve_cmpeq:
1570 case Intrinsic::aarch64_sve_cmpeq_wide:
1571 case Intrinsic::aarch64_sve_cmpge:
1572 case Intrinsic::aarch64_sve_cmpge_wide:
1573 case Intrinsic::aarch64_sve_cmpgt:
1574 case Intrinsic::aarch64_sve_cmpgt_wide:
1575 case Intrinsic::aarch64_sve_cmphi:
1576 case Intrinsic::aarch64_sve_cmphi_wide:
1577 case Intrinsic::aarch64_sve_cmphs:
1578 case Intrinsic::aarch64_sve_cmphs_wide:
1579 case Intrinsic::aarch64_sve_cmple_wide:
1580 case Intrinsic::aarch64_sve_cmplo_wide:
1581 case Intrinsic::aarch64_sve_cmpls_wide:
1582 case Intrinsic::aarch64_sve_cmplt_wide:
1583 case Intrinsic::aarch64_sve_cmpne:
1584 case Intrinsic::aarch64_sve_cmpne_wide:
1585 case Intrinsic::aarch64_sve_facge:
1586 case Intrinsic::aarch64_sve_facgt:
1587 case Intrinsic::aarch64_sve_fcmpeq:
1588 case Intrinsic::aarch64_sve_fcmpge:
1589 case Intrinsic::aarch64_sve_fcmpgt:
1590 case Intrinsic::aarch64_sve_fcmpne:
1591 case Intrinsic::aarch64_sve_fcmpuo:
1592 case Intrinsic::aarch64_sve_ld1:
1593 case Intrinsic::aarch64_sve_ld1_gather:
1594 case Intrinsic::aarch64_sve_ld1_gather_index:
1595 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1596 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1597 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1598 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1599 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1600 case Intrinsic::aarch64_sve_ld1q_gather_index:
1601 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1602 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1603 case Intrinsic::aarch64_sve_ld1ro:
1604 case Intrinsic::aarch64_sve_ld1rq:
1605 case Intrinsic::aarch64_sve_ld1udq:
1606 case Intrinsic::aarch64_sve_ld1uwq:
1607 case Intrinsic::aarch64_sve_ld2_sret:
1608 case Intrinsic::aarch64_sve_ld2q_sret:
1609 case Intrinsic::aarch64_sve_ld3_sret:
1610 case Intrinsic::aarch64_sve_ld3q_sret:
1611 case Intrinsic::aarch64_sve_ld4_sret:
1612 case Intrinsic::aarch64_sve_ld4q_sret:
1613 case Intrinsic::aarch64_sve_ldff1:
1614 case Intrinsic::aarch64_sve_ldff1_gather:
1615 case Intrinsic::aarch64_sve_ldff1_gather_index:
1616 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1617 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1618 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1619 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1620 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1621 case Intrinsic::aarch64_sve_ldnf1:
1622 case Intrinsic::aarch64_sve_ldnt1:
1623 case Intrinsic::aarch64_sve_ldnt1_gather:
1624 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1625 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1626 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1628
1629 case Intrinsic::aarch64_sve_prf:
1630 case Intrinsic::aarch64_sve_prfb_gather_index:
1631 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1632 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1633 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1634 case Intrinsic::aarch64_sve_prfd_gather_index:
1635 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1636 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1637 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1638 case Intrinsic::aarch64_sve_prfh_gather_index:
1639 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1640 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1641 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1642 case Intrinsic::aarch64_sve_prfw_gather_index:
1643 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1644 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1645 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1647
1648 case Intrinsic::aarch64_sve_st1_scatter:
1649 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1650 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1651 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1652 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1653 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1654 case Intrinsic::aarch64_sve_st1dq:
1655 case Intrinsic::aarch64_sve_st1q_scatter_index:
1656 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1657 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1658 case Intrinsic::aarch64_sve_st1wq:
1659 case Intrinsic::aarch64_sve_stnt1:
1660 case Intrinsic::aarch64_sve_stnt1_scatter:
1661 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1662 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1663 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1665 case Intrinsic::aarch64_sve_st2:
1666 case Intrinsic::aarch64_sve_st2q:
1668 case Intrinsic::aarch64_sve_st3:
1669 case Intrinsic::aarch64_sve_st3q:
1671 case Intrinsic::aarch64_sve_st4:
1672 case Intrinsic::aarch64_sve_st4q:
1674 }
1675
1676 return SVEIntrinsicInfo();
1677}
1678
1679static bool isAllActivePredicate(Value *Pred) {
1680 Value *UncastedPred;
1681
1682 // Look through predicate casts that only remove lanes.
1684 m_Value(UncastedPred)))) {
1685 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1686 Pred = UncastedPred;
1687
1689 m_Value(UncastedPred))))
1690 // If the predicate has the same or fewer lanes than the uncasted predicate,
1691 // then we know the casting has no effect.
1692 if (OrigPredTy->getMinNumElements() <=
1693 cast<ScalableVectorType>(UncastedPred->getType())
1694 ->getMinNumElements())
1695 Pred = UncastedPred;
1696 }
1697
1698 auto *C = dyn_cast<Constant>(Pred);
1699 return C && C->isAllOnesValue();
1700}
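// Illustrative sketch of the cast-stripping above (value names and lane counts
// invented for the example):
//   %w = to_svbool(<vscale x 4 x i1> splat (i1 true))   ; widen 4 -> 16 lanes
//   %p = from_svbool(%w)                                ; narrow back to 4 lanes
// isAllActivePredicate(%p) returns true: the casts only remove lanes, so the
// value left after stripping them is still the all-ones constant.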
1701
1702// Simplify `V` by only considering the operations that affect active lanes.
1703// This function should only return existing Values or newly created Constants.
1704static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1705 auto *Dup = dyn_cast<IntrinsicInst>(V);
1706 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1707 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1709 cast<VectorType>(V->getType())->getElementCount(),
1710 cast<Constant>(Dup->getOperand(2)));
1711
1712 return V;
1713}
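// Illustrative sketch (operand names invented):
//   %v = sve.dup(undef, %pg, i32 2)     ; only the %pg-governed lanes are set
// stripInactiveLanes(%v, %pg) returns splat(i32 2), because a caller that only
// cares about the %pg lanes may treat the dup as a plain constant splat.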
1714
1715static std::optional<Instruction *>
1717 const SVEIntrinsicInfo &IInfo) {
1718 const unsigned Opc = IInfo.getMatchingIROpode();
1719 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1720
1721 Value *Pg = II.getOperand(0);
1722 Value *Op1 = II.getOperand(1);
1723 Value *Op2 = II.getOperand(2);
1724 const DataLayout &DL = II.getDataLayout();
1725
1726 // Canonicalise constants to the RHS.
1728 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1729 IC.replaceOperand(II, 1, Op2);
1730 IC.replaceOperand(II, 2, Op1);
1731 return &II;
1732 }
1733
1734 // Only active lanes matter when simplifying the operation.
1735 Op1 = stripInactiveLanes(Op1, Pg);
1736 Op2 = stripInactiveLanes(Op2, Pg);
1737
1738 Value *SimpleII;
1739 if (auto FII = dyn_cast<FPMathOperator>(&II))
1740 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1741 else
1742 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1743
1744 // An SVE intrinsic's result is always defined. However, this is not the case
1745 // for its equivalent IR instruction (e.g. when shifting by an amount more
1746 // than the data's bitwidth). Simplifications to an undefined result must be
1747 // ignored to preserve the intrinsic's expected behaviour.
1748 if (!SimpleII || isa<UndefValue>(SimpleII))
1749 return std::nullopt;
1750
1751 if (IInfo.inactiveLanesAreNotDefined())
1752 return IC.replaceInstUsesWith(II, SimpleII);
1753
1754 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1755
1756 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1757 if (SimpleII == Inactive)
1758 return IC.replaceInstUsesWith(II, SimpleII);
1759
1760 // Inactive lanes must be preserved.
1761 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1762 return IC.replaceInstUsesWith(II, SimpleII);
1763}
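// Illustrative sketches of the combine above (shorthand, invented operands,
// assuming the merging mul has a matching IR opcode registered):
//   sve.mul(%pg, %a, splat(1))   --> %a, since the simplified result equals
//                                    the inactive-lane source.
//   sve.mul(%pg, %a, splat(0))   --> select(%pg, splat(0), %a), preserving the
//                                    inactive lanes of %a.
//   sve.lsr_u(%pg, %a, splat(99)) --> unchanged: the IR shift would simplify
//                                    to an undefined value, which is ignored.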
1764
1765// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1766// to operations with less strict inactive lane requirements.
1767static std::optional<Instruction *>
1769 const SVEIntrinsicInfo &IInfo) {
1770 if (!IInfo.hasGoverningPredicate())
1771 return std::nullopt;
1772
1773 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1774
1775 // If there are no active lanes.
1776 if (match(OpPredicate, m_ZeroInt())) {
1778 return IC.replaceInstUsesWith(
1779 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1780
1781 if (IInfo.inactiveLanesAreUnused()) {
1782 if (IInfo.resultIsZeroInitialized())
1784
1785 return IC.eraseInstFromFunction(II);
1786 }
1787 }
1788
1789 // If there are no inactive lanes.
1790 if (isAllActivePredicate(OpPredicate)) {
1791 if (IInfo.hasOperandWithNoActiveLanes()) {
1792 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1793 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1794 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1795 }
1796
1797 if (IInfo.hasMatchingUndefIntrinsic()) {
1798 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1799 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1800 II.setCalledFunction(NewDecl);
1801 return &II;
1802 }
1803 }
1804
1805 // Operation specific simplifications.
1806 if (IInfo.hasMatchingIROpode() &&
1808 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1809
1810 return std::nullopt;
1811}
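// Illustrative sketches of the predicate-based folds above (shorthand,
// invented operands):
//   sve.add(pfalse, %a, %b)    --> %a, the operand the inactive lanes are
//                                  taken from, since no lane is active.
//   sve.add(ptrue_all, %a, %b) --> sve.add_u(ptrue_all, %a, %b), the variant
//                                  with no inactive-lane requirement.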
1812
1813// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1814// => (binop (pred) (from_svbool _) (from_svbool _))
1815//
1816// The above transformation eliminates a `to_svbool` in the predicate
1817// operand of bitwise operation `binop` by narrowing the vector width of
1818// the operation. For example, it would convert a `<vscale x 16 x i1>
1819// and` into a `<vscale x 4 x i1> and`. This is profitable because
1820// to_svbool must zero the new lanes during widening, whereas
1821// from_svbool is free.
1822static std::optional<Instruction *>
1824 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1825 if (!BinOp)
1826 return std::nullopt;
1827
1828 auto IntrinsicID = BinOp->getIntrinsicID();
1829 switch (IntrinsicID) {
1830 case Intrinsic::aarch64_sve_and_z:
1831 case Intrinsic::aarch64_sve_bic_z:
1832 case Intrinsic::aarch64_sve_eor_z:
1833 case Intrinsic::aarch64_sve_nand_z:
1834 case Intrinsic::aarch64_sve_nor_z:
1835 case Intrinsic::aarch64_sve_orn_z:
1836 case Intrinsic::aarch64_sve_orr_z:
1837 break;
1838 default:
1839 return std::nullopt;
1840 }
1841
1842 auto BinOpPred = BinOp->getOperand(0);
1843 auto BinOpOp1 = BinOp->getOperand(1);
1844 auto BinOpOp2 = BinOp->getOperand(2);
1845
1846 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1847 if (!PredIntr ||
1848 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1849 return std::nullopt;
1850
1851 auto PredOp = PredIntr->getOperand(0);
1852 auto PredOpTy = cast<VectorType>(PredOp->getType());
1853 if (PredOpTy != II.getType())
1854 return std::nullopt;
1855
1856 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1857 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1858 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1859 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1860 if (BinOpOp1 == BinOpOp2)
1861 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1862 else
1863 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1864 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1865
1866 auto NarrowedBinOp =
1867 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1868 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1869}
1870
1871static std::optional<Instruction *>
1873 // If the reinterpret instruction operand is a PHI Node
1874 if (isa<PHINode>(II.getArgOperand(0)))
1875 return processPhiNode(IC, II);
1876
1877 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1878 return BinOpCombine;
1879
1880 // Ignore converts to/from svcount_t.
1881 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1882 isa<TargetExtType>(II.getType()))
1883 return std::nullopt;
1884
1885 SmallVector<Instruction *, 32> CandidatesForRemoval;
1886 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1887
1888 const auto *IVTy = cast<VectorType>(II.getType());
1889
1890 // Walk the chain of conversions.
1891 while (Cursor) {
1892 // If the type of the cursor has fewer lanes than the final result, zeroing
1893 // must take place, which breaks the equivalence chain.
1894 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1895 if (CursorVTy->getElementCount().getKnownMinValue() <
1896 IVTy->getElementCount().getKnownMinValue())
1897 break;
1898
1899 // If the cursor has the same type as II, it is a viable replacement.
1900 if (Cursor->getType() == IVTy)
1901 EarliestReplacement = Cursor;
1902
1903 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1904
1905 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1906 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1907 Intrinsic::aarch64_sve_convert_to_svbool ||
1908 IntrinsicCursor->getIntrinsicID() ==
1909 Intrinsic::aarch64_sve_convert_from_svbool))
1910 break;
1911
1912 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1913 Cursor = IntrinsicCursor->getOperand(0);
1914 }
1915
1916 // If no viable replacement in the conversion chain was found, there is
1917 // nothing to do.
1918 if (!EarliestReplacement)
1919 return std::nullopt;
1920
1921 return IC.replaceInstUsesWith(II, EarliestReplacement);
1922}
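// Illustrative sketch of the chain walk above (types invented):
//   %a = from_svbool(%x)   ; <vscale x 4 x i1>
//   %b = to_svbool(%a)     ; <vscale x 16 x i1>
//   %c = from_svbool(%b)   ; <vscale x 4 x i1>, this intrinsic
// No step narrows below 4 lanes and %a already has the result type, so %c is
// replaced by %a and the intermediate conversions become dead.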
1923
1924static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1925 IntrinsicInst &II) {
1926 // svsel(ptrue, x, y) => x
1927 auto *OpPredicate = II.getOperand(0);
1928 if (isAllActivePredicate(OpPredicate))
1929 return IC.replaceInstUsesWith(II, II.getOperand(1));
1930
1931 auto Select =
1932 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1933 return IC.replaceInstUsesWith(II, Select);
1934}
1935
1936static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1937 IntrinsicInst &II) {
1938 Value *Pg = II.getOperand(1);
1939
1940 // sve.dup(V, all_active, X) ==> splat(X)
1941 if (isAllActivePredicate(Pg)) {
1942 auto *RetTy = cast<ScalableVectorType>(II.getType());
1943 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1944 II.getArgOperand(2));
1945 return IC.replaceInstUsesWith(II, Splat);
1946 }
1947
1949 m_SpecificInt(AArch64SVEPredPattern::vl1))))
1950 return std::nullopt;
1951
1952 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
1953 Value *Insert = IC.Builder.CreateInsertElement(
1954 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
1955 return IC.replaceInstUsesWith(II, Insert);
1956}
1957
1958static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1959 IntrinsicInst &II) {
1960 // Replace DupX with a regular IR splat.
1961 auto *RetTy = cast<ScalableVectorType>(II.getType());
1962 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1963 II.getArgOperand(0));
1964 Splat->takeName(&II);
1965 return IC.replaceInstUsesWith(II, Splat);
1966}
1967
1968static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1969 IntrinsicInst &II) {
1970 LLVMContext &Ctx = II.getContext();
1971
1972 if (!isAllActivePredicate(II.getArgOperand(0)))
1973 return std::nullopt;
1974
1975 // Check that we have a compare of zero..
1976 auto *SplatValue =
1978 if (!SplatValue || !SplatValue->isZero())
1979 return std::nullopt;
1980
1981 // ..against a dupq
1982 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1983 if (!DupQLane ||
1984 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1985 return std::nullopt;
1986
1987 // Where the dupq is a lane 0 replicate of a vector insert
1988 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1989 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1990 return std::nullopt;
1991
1992 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1993 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1994 return std::nullopt;
1995
1996 // Where the vector insert is a fixed constant vector insert into undef at
1997 // index zero
1998 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1999 return std::nullopt;
2000
2001 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2002 return std::nullopt;
2003
2004 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2005 if (!ConstVec)
2006 return std::nullopt;
2007
2008 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2009 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2010 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2011 return std::nullopt;
2012
2013 unsigned NumElts = VecTy->getNumElements();
2014 unsigned PredicateBits = 0;
2015
2016 // Expand intrinsic operands to a 16-bit byte level predicate
2017 for (unsigned I = 0; I < NumElts; ++I) {
2018 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2019 if (!Arg)
2020 return std::nullopt;
2021 if (!Arg->isZero())
2022 PredicateBits |= 1 << (I * (16 / NumElts));
2023 }
2024
2025 // If all bits are zero, bail early with an empty predicate
2026 if (PredicateBits == 0) {
2027 auto *PFalse = Constant::getNullValue(II.getType());
2028 PFalse->takeName(&II);
2029 return IC.replaceInstUsesWith(II, PFalse);
2030 }
2031
2032 // Calculate largest predicate type used (where byte predicate is largest)
2033 unsigned Mask = 8;
2034 for (unsigned I = 0; I < 16; ++I)
2035 if ((PredicateBits & (1 << I)) != 0)
2036 Mask |= (I % 8);
2037
2038 unsigned PredSize = Mask & -Mask;
2039 auto *PredType = ScalableVectorType::get(
2040 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2041
2042 // Ensure all relevant bits are set
2043 for (unsigned I = 0; I < 16; I += PredSize)
2044 if ((PredicateBits & (1 << I)) == 0)
2045 return std::nullopt;
2046
2047 auto *PTruePat =
2048 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2049 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2050 {PredType}, {PTruePat});
2051 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2052 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2053 auto *ConvertFromSVBool =
2054 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2055 {II.getType()}, {ConvertToSVBool});
2056
2057 ConvertFromSVBool->takeName(&II);
2058 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2059}
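// Worked example of the predicate reconstruction above (values invented): for
// a <vscale x 4 x i32> cmpne against a lane-0 dupq of <4 x i32> <1, 0, 1, 0>,
// NumElts is 4, so each non-zero element sets a bit at position I * 4:
//   PredicateBits = 0x0101 (bits 0 and 8), Mask = 8, PredSize = 8
// giving a <vscale x 2 x i1> ptrue that, converted via svbool back to
// <vscale x 4 x i1>, activates lanes 0 and 2 of every 128-bit block.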
2060
2061static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2062 IntrinsicInst &II) {
2063 Value *Pg = II.getArgOperand(0);
2064 Value *Vec = II.getArgOperand(1);
2065 auto IntrinsicID = II.getIntrinsicID();
2066 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2067
2068 // lastX(splat(X)) --> X
2069 if (auto *SplatVal = getSplatValue(Vec))
2070 return IC.replaceInstUsesWith(II, SplatVal);
2071
2072 // If x and/or y is a splat value then:
2073 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2074 Value *LHS, *RHS;
2075 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2076 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2077 auto *OldBinOp = cast<BinaryOperator>(Vec);
2078 auto OpC = OldBinOp->getOpcode();
2079 auto *NewLHS =
2080 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2081 auto *NewRHS =
2082 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2084 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2085 return IC.replaceInstUsesWith(II, NewBinOp);
2086 }
2087 }
2088
2089 auto *C = dyn_cast<Constant>(Pg);
2090 if (IsAfter && C && C->isNullValue()) {
2091 // The intrinsic is extracting lane 0 so use an extract instead.
2092 auto *IdxTy = Type::getInt64Ty(II.getContext());
2093 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2094 Extract->insertBefore(II.getIterator());
2095 Extract->takeName(&II);
2096 return IC.replaceInstUsesWith(II, Extract);
2097 }
2098
2099 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2100 if (!IntrPG)
2101 return std::nullopt;
2102
2103 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2104 return std::nullopt;
2105
2106 const auto PTruePattern =
2107 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2108
2109 // Can the intrinsic's predicate be converted to a known constant index?
2110 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2111 if (!MinNumElts)
2112 return std::nullopt;
2113
2114 unsigned Idx = MinNumElts - 1;
2115 // Increment the index if extracting the element after the last active
2116 // predicate element.
2117 if (IsAfter)
2118 ++Idx;
2119
2120 // Ignore extracts whose index is larger than the known minimum vector
2121 // length. NOTE: This is an artificial constraint where we prefer to
2122 // maintain what the user asked for until an alternative is proven faster.
2123 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2124 if (Idx >= PgVTy->getMinNumElements())
2125 return std::nullopt;
2126
2127 // The intrinsic is extracting a fixed lane so use an extract instead.
2128 auto *IdxTy = Type::getInt64Ty(II.getContext());
2129 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2130 Extract->insertBefore(II.getIterator());
2131 Extract->takeName(&II);
2132 return IC.replaceInstUsesWith(II, Extract);
2133}
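// Illustrative sketches (shorthand, invented operands):
//   lastb(ptrue(vl4), %v) --> extractelement %v, i64 3
//   lasta(ptrue(vl4), %v) --> extractelement %v, i64 4, but only when the
//     predicate type has more than four lanes; otherwise the extract would
//     exceed the known minimum vector length and the call is left alone.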
2134
2135static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2136 IntrinsicInst &II) {
2137 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2138 // integer variant across a variety of micro-architectures. Replace scalar
2139 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2140 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2141 // depending on the micro-architecture, but has been observed as generally
2142 // being faster, particularly when the CLAST[AB] op is a loop-carried
2143 // dependency.
2144 Value *Pg = II.getArgOperand(0);
2145 Value *Fallback = II.getArgOperand(1);
2146 Value *Vec = II.getArgOperand(2);
2147 Type *Ty = II.getType();
2148
2149 if (!Ty->isIntegerTy())
2150 return std::nullopt;
2151
2152 Type *FPTy;
2153 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2154 default:
2155 return std::nullopt;
2156 case 16:
2157 FPTy = IC.Builder.getHalfTy();
2158 break;
2159 case 32:
2160 FPTy = IC.Builder.getFloatTy();
2161 break;
2162 case 64:
2163 FPTy = IC.Builder.getDoubleTy();
2164 break;
2165 }
2166
2167 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2168 auto *FPVTy = VectorType::get(
2169 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2170 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2171 auto *FPII = IC.Builder.CreateIntrinsic(
2172 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2173 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2174 return IC.replaceInstUsesWith(II, FPIItoInt);
2175}
2176
2177static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2178 IntrinsicInst &II) {
2179 LLVMContext &Ctx = II.getContext();
2180 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2181 // can work with RDFFR_PP for ptest elimination.
2182 auto *AllPat =
2183 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2184 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2185 {II.getType()}, {AllPat});
2186 auto *RDFFR =
2187 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2188 RDFFR->takeName(&II);
2189 return IC.replaceInstUsesWith(II, RDFFR);
2190}
2191
2192static std::optional<Instruction *>
2194 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2195
2196 if (Pattern == AArch64SVEPredPattern::all) {
2198 II.getType(), ElementCount::getScalable(NumElts));
2199 Cnt->takeName(&II);
2200 return IC.replaceInstUsesWith(II, Cnt);
2201 }
2202
2203 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2204
2205 return MinNumElts && NumElts >= MinNumElts
2206 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2207 II, ConstantInt::get(II.getType(), MinNumElts)))
2208 : std::nullopt;
2209}
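// Illustrative sketches for the element-count folds above (cntw, NumElts = 4):
//   cntw(all) --> a vscale-based constant equal to vscale * 4
//   cntw(vl4) --> 4, since the pattern is known to fit in the minimum VL
//   cntw(vl8) --> unchanged: whether eight word elements fit depends on the
//                 runtime vector length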
2210
2211static std::optional<Instruction *>
2213 const AArch64Subtarget *ST) {
2214 if (!ST->isStreaming())
2215 return std::nullopt;
2216
2217 // In streaming mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2218 // with SVEPredPattern::all.
2219 Value *Cnt =
2221 Cnt->takeName(&II);
2222 return IC.replaceInstUsesWith(II, Cnt);
2223}
2224
2225static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2226 IntrinsicInst &II) {
2227 Value *PgVal = II.getArgOperand(0);
2228 Value *OpVal = II.getArgOperand(1);
2229
2230 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2231 // Later optimizations prefer this form.
2232 if (PgVal == OpVal &&
2233 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2234 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2235 Value *Ops[] = {PgVal, OpVal};
2236 Type *Tys[] = {PgVal->getType()};
2237
2238 auto *PTest =
2239 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2240 PTest->takeName(&II);
2241
2242 return IC.replaceInstUsesWith(II, PTest);
2243 }
2244
2247
2248 if (!Pg || !Op)
2249 return std::nullopt;
2250
2251 Intrinsic::ID OpIID = Op->getIntrinsicID();
2252
2253 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2254 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2255 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2256 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2257 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2258
2259 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2260
2261 PTest->takeName(&II);
2262 return IC.replaceInstUsesWith(II, PTest);
2263 }
2264
2265 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2266 // Later optimizations may rewrite the sequence to use the flag-setting variant
2267 // of instruction X to remove the PTEST.
2268 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2269 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2270 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2271 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2272 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2273 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2274 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2275 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2276 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2277 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2278 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2279 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2280 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2281 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2282 Type *Tys[] = {Pg->getType()};
2283
2284 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2285 PTest->takeName(&II);
2286
2287 return IC.replaceInstUsesWith(II, PTest);
2288 }
2289
2290 return std::nullopt;
2291}
2292
2293template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2294static std::optional<Instruction *>
2296 bool MergeIntoAddendOp) {
2297 Value *P = II.getOperand(0);
2298 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2299 if (MergeIntoAddendOp) {
2300 AddendOp = II.getOperand(1);
2301 Mul = II.getOperand(2);
2302 } else {
2303 AddendOp = II.getOperand(2);
2304 Mul = II.getOperand(1);
2305 }
2306
2308 m_Value(MulOp1))))
2309 return std::nullopt;
2310
2311 if (!Mul->hasOneUse())
2312 return std::nullopt;
2313
2314 Instruction *FMFSource = nullptr;
2315 if (II.getType()->isFPOrFPVectorTy()) {
2316 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2317 // Stop the combine when the flags on the inputs differ in case dropping
2318 // flags would lead to us missing out on more beneficial optimizations.
2319 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2320 return std::nullopt;
2321 if (!FAddFlags.allowContract())
2322 return std::nullopt;
2323 FMFSource = &II;
2324 }
2325
2326 CallInst *Res;
2327 if (MergeIntoAddendOp)
2328 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2329 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2330 else
2331 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2332 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2333
2334 return IC.replaceInstUsesWith(II, Res);
2335}
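// Illustrative sketches of the fusion above (shorthand, invented operands):
//   fadd(%pg, %acc, fmul(%pg, %b, %c)) --> fmla(%pg, %acc, %b, %c)
//   fadd(%pg, fmul(%pg, %b, %c), %acc) --> fmad(%pg, %b, %c, %acc)
// The multiply must have a single use and, for floating point, carry the same
// fast-math flags as the add, including the contract flag.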
2336
2337static std::optional<Instruction *>
2339 Value *Pred = II.getOperand(0);
2340 Value *PtrOp = II.getOperand(1);
2341 Type *VecTy = II.getType();
2342
2343 if (isAllActivePredicate(Pred)) {
2344 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2345 Load->copyMetadata(II);
2346 return IC.replaceInstUsesWith(II, Load);
2347 }
2348
2349 CallInst *MaskedLoad =
2350 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2351 Pred, ConstantAggregateZero::get(VecTy));
2352 MaskedLoad->copyMetadata(II);
2353 return IC.replaceInstUsesWith(II, MaskedLoad);
2354}
2355
2356static std::optional<Instruction *>
2358 Value *VecOp = II.getOperand(0);
2359 Value *Pred = II.getOperand(1);
2360 Value *PtrOp = II.getOperand(2);
2361
2362 if (isAllActivePredicate(Pred)) {
2363 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2364 Store->copyMetadata(II);
2365 return IC.eraseInstFromFunction(II);
2366 }
2367
2368 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2369 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2370 MaskedStore->copyMetadata(II);
2371 return IC.eraseInstFromFunction(II);
2372}
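// Illustrative sketches for the ld1/st1 combines above (shorthand):
//   sve.ld1(ptrue_all, %ptr)     --> a plain vector load of %ptr
//   sve.ld1(%pg, %ptr)           --> masked.load(%ptr, %pg, zeroinitializer)
//   sve.st1(%v, ptrue_all, %ptr) --> a plain vector store of %v to %ptr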
2373
2375 switch (Intrinsic) {
2376 case Intrinsic::aarch64_sve_fmul_u:
2377 return Instruction::BinaryOps::FMul;
2378 case Intrinsic::aarch64_sve_fadd_u:
2379 return Instruction::BinaryOps::FAdd;
2380 case Intrinsic::aarch64_sve_fsub_u:
2381 return Instruction::BinaryOps::FSub;
2382 default:
2383 return Instruction::BinaryOpsEnd;
2384 }
2385}
2386
2387static std::optional<Instruction *>
2389 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2390 if (II.isStrictFP())
2391 return std::nullopt;
2392
2393 auto *OpPredicate = II.getOperand(0);
2394 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2395 if (BinOpCode == Instruction::BinaryOpsEnd ||
2396 !isAllActivePredicate(OpPredicate))
2397 return std::nullopt;
2398 auto BinOp = IC.Builder.CreateBinOpFMF(
2399 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2400 return IC.replaceInstUsesWith(II, BinOp);
2401}
2402
2403static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2404 IntrinsicInst &II) {
2405 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2406 Intrinsic::aarch64_sve_mla>(
2407 IC, II, true))
2408 return MLA;
2409 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2410 Intrinsic::aarch64_sve_mad>(
2411 IC, II, false))
2412 return MAD;
2413 return std::nullopt;
2414}
2415
2416static std::optional<Instruction *>
2418 if (auto FMLA =
2419 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2420 Intrinsic::aarch64_sve_fmla>(IC, II,
2421 true))
2422 return FMLA;
2423 if (auto FMAD =
2424 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2425 Intrinsic::aarch64_sve_fmad>(IC, II,
2426 false))
2427 return FMAD;
2428 if (auto FMLA =
2429 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2430 Intrinsic::aarch64_sve_fmla>(IC, II,
2431 true))
2432 return FMLA;
2433 return std::nullopt;
2434}
2435
2436static std::optional<Instruction *>
2438 if (auto FMLA =
2439 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2440 Intrinsic::aarch64_sve_fmla>(IC, II,
2441 true))
2442 return FMLA;
2443 if (auto FMAD =
2444 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2445 Intrinsic::aarch64_sve_fmad>(IC, II,
2446 false))
2447 return FMAD;
2448 if (auto FMLA_U =
2449 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2450 Intrinsic::aarch64_sve_fmla_u>(
2451 IC, II, true))
2452 return FMLA_U;
2453 return instCombineSVEVectorBinOp(IC, II);
2454}
2455
2456static std::optional<Instruction *>
2458 if (auto FMLS =
2459 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2460 Intrinsic::aarch64_sve_fmls>(IC, II,
2461 true))
2462 return FMLS;
2463 if (auto FMSB =
2464 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2465 Intrinsic::aarch64_sve_fnmsb>(
2466 IC, II, false))
2467 return FMSB;
2468 if (auto FMLS =
2469 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2470 Intrinsic::aarch64_sve_fmls>(IC, II,
2471 true))
2472 return FMLS;
2473 return std::nullopt;
2474}
2475
2476static std::optional<Instruction *>
2478 if (auto FMLS =
2479 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2480 Intrinsic::aarch64_sve_fmls>(IC, II,
2481 true))
2482 return FMLS;
2483 if (auto FMSB =
2484 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2485 Intrinsic::aarch64_sve_fnmsb>(
2486 IC, II, false))
2487 return FMSB;
2488 if (auto FMLS_U =
2489 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2490 Intrinsic::aarch64_sve_fmls_u>(
2491 IC, II, true))
2492 return FMLS_U;
2493 return instCombineSVEVectorBinOp(IC, II);
2494}
2495
2496static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2497 IntrinsicInst &II) {
2498 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2499 Intrinsic::aarch64_sve_mls>(
2500 IC, II, true))
2501 return MLS;
2502 return std::nullopt;
2503}
2504
2505static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2506 IntrinsicInst &II) {
2507 Value *UnpackArg = II.getArgOperand(0);
2508 auto *RetTy = cast<ScalableVectorType>(II.getType());
2509 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2510 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2511
2512 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2513 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2514 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2515 ScalarArg =
2516 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2517 Value *NewVal =
2518 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2519 NewVal->takeName(&II);
2520 return IC.replaceInstUsesWith(II, NewVal);
2521 }
2522
2523 return std::nullopt;
2524}
2525static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2526 IntrinsicInst &II) {
2527 auto *OpVal = II.getOperand(0);
2528 auto *OpIndices = II.getOperand(1);
2529 VectorType *VTy = cast<VectorType>(II.getType());
2530
2531 // Check whether OpIndices is a constant splat value < minimal element count
2532 // of result.
2533 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2534 if (!SplatValue ||
2535 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2536 return std::nullopt;
2537
2538 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2539 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2540 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2541 auto *VectorSplat =
2542 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2543
2544 VectorSplat->takeName(&II);
2545 return IC.replaceInstUsesWith(II, VectorSplat);
2546}
2547
2548static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2549 IntrinsicInst &II) {
2550 Value *A, *B;
2551 Type *RetTy = II.getType();
2552 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2553 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2554
2555 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2556 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2557 if ((match(II.getArgOperand(0),
2559 match(II.getArgOperand(1),
2561 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2562 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2563 auto *TyA = cast<ScalableVectorType>(A->getType());
2564 if (TyA == B->getType() &&
2566 auto *SubVec = IC.Builder.CreateInsertVector(
2567 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2568 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2569 TyA->getMinNumElements());
2570 ConcatVec->takeName(&II);
2571 return IC.replaceInstUsesWith(II, ConcatVec);
2572 }
2573 }
2574
2575 return std::nullopt;
2576}
2577
2578static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2579 IntrinsicInst &II) {
2580 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2581 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2582 Value *A, *B;
2583 if (match(II.getArgOperand(0),
2586 m_Specific(A), m_Specific(B))))
2587 return IC.replaceInstUsesWith(
2588 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2589
2590 return std::nullopt;
2591}
2592
2593static std::optional<Instruction *>
2595 Value *Mask = II.getOperand(0);
2596 Value *BasePtr = II.getOperand(1);
2597 Value *Index = II.getOperand(2);
2598 Type *Ty = II.getType();
2599 Value *PassThru = ConstantAggregateZero::get(Ty);
2600
2601 // Contiguous gather => masked load.
2602 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2603 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2604 Value *IndexBase;
2606 m_Value(IndexBase), m_SpecificInt(1)))) {
2607 Align Alignment =
2608 BasePtr->getPointerAlignment(II.getDataLayout());
2609
2610 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2611 BasePtr, IndexBase);
2612 CallInst *MaskedLoad =
2613 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2614 MaskedLoad->takeName(&II);
2615 return IC.replaceInstUsesWith(II, MaskedLoad);
2616 }
2617
2618 return std::nullopt;
2619}
2620
2621static std::optional<Instruction *>
2623 Value *Val = II.getOperand(0);
2624 Value *Mask = II.getOperand(1);
2625 Value *BasePtr = II.getOperand(2);
2626 Value *Index = II.getOperand(3);
2627 Type *Ty = Val->getType();
2628
2629 // Contiguous scatter => masked store.
2630 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2631 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2632 Value *IndexBase;
2634 m_Value(IndexBase), m_SpecificInt(1)))) {
2635 Align Alignment =
2636 BasePtr->getPointerAlignment(II.getDataLayout());
2637
2638 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2639 BasePtr, IndexBase);
2640 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2641
2642 return IC.eraseInstFromFunction(II);
2643 }
2644
2645 return std::nullopt;
2646}
2647
2648static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2649 IntrinsicInst &II) {
2651 Value *Pred = II.getOperand(0);
2652 Value *Vec = II.getOperand(1);
2653 Value *DivVec = II.getOperand(2);
2654
2655 Value *SplatValue = getSplatValue(DivVec);
2656 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2657 if (!SplatConstantInt)
2658 return std::nullopt;
2659
2660 APInt Divisor = SplatConstantInt->getValue();
2661 const int64_t DivisorValue = Divisor.getSExtValue();
2662 if (DivisorValue == -1)
2663 return std::nullopt;
2664 if (DivisorValue == 1)
2665 IC.replaceInstUsesWith(II, Vec);
2666
2667 if (Divisor.isPowerOf2()) {
2668 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2669 auto ASRD = IC.Builder.CreateIntrinsic(
2670 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2671 return IC.replaceInstUsesWith(II, ASRD);
2672 }
2673 if (Divisor.isNegatedPowerOf2()) {
2674 Divisor.negate();
2675 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2676 auto ASRD = IC.Builder.CreateIntrinsic(
2677 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2678 auto NEG = IC.Builder.CreateIntrinsic(
2679 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2680 return IC.replaceInstUsesWith(II, NEG);
2681 }
2682
2683 return std::nullopt;
2684}
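// Worked examples for the signed-division combine above (shorthand):
//   sdiv(%pg, %a, splat(8))  --> asrd(%pg, %a, 3)
//   sdiv(%pg, %a, splat(-8)) --> asrd(%pg, %a, 3) followed by a predicated neg
//   sdiv(%pg, %a, splat(7))  --> unchanged, 7 is not a power of two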
2685
2686bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2687 size_t VecSize = Vec.size();
2688 if (VecSize == 1)
2689 return true;
2690 if (!isPowerOf2_64(VecSize))
2691 return false;
2692 size_t HalfVecSize = VecSize / 2;
2693
2694 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2695 RHS != Vec.end(); LHS++, RHS++) {
2696 if (*LHS != nullptr && *RHS != nullptr) {
2697 if (*LHS == *RHS)
2698 continue;
2699 else
2700 return false;
2701 }
2702 if (!AllowPoison)
2703 return false;
2704 if (*LHS == nullptr && *RHS != nullptr)
2705 *LHS = *RHS;
2706 }
2707
2708 Vec.resize(HalfVecSize);
2709 SimplifyValuePattern(Vec, AllowPoison);
2710 return true;
2711}
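// Illustrative behaviour of SimplifyValuePattern (nullptr marks a missing,
// i.e. poison, element):
//   (a, b, a, b)        --> (a, b)
//   (a, nullptr, a, b)  --> (a, b) when AllowPoison, the gap takes the value
//                           of its counterpart in the other half
//   (a, b, b, a)        --> unchanged, the halves do not match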
2712
2713// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2714// to dupqlane(f64(C)) where C is A concatenated with B
2715static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2716 IntrinsicInst &II) {
2717 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2718 if (!match(II.getOperand(0),
2720 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2721 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2722 return std::nullopt;
2723 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2724
2725 // Insert the scalars into a container ordered by InsertElement index
2726 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2727 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2728 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2729 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2730 CurrentInsertElt = InsertElt->getOperand(0);
2731 }
2732
2733 bool AllowPoison =
2734 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2735 if (!SimplifyValuePattern(Elts, AllowPoison))
2736 return std::nullopt;
2737
2738 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2739 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2740 for (size_t I = 0; I < Elts.size(); I++) {
2741 if (Elts[I] == nullptr)
2742 continue;
2743 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2744 IC.Builder.getInt64(I));
2745 }
2746 if (InsertEltChain == nullptr)
2747 return std::nullopt;
2748
2749 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2750 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2751 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2752 // be narrowed back to the original type.
2753 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2754 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2755 IIScalableTy->getMinNumElements() /
2756 PatternWidth;
2757
2758 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2759 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2760 auto *WideShuffleMaskTy =
2761 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2762
2763 auto InsertSubvector = IC.Builder.CreateInsertVector(
2764 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2765 uint64_t(0));
2766 auto WideBitcast =
2767 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2768 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2769 auto WideShuffle = IC.Builder.CreateShuffleVector(
2770 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2771 auto NarrowBitcast =
2772 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2773
2774 return IC.replaceInstUsesWith(II, NarrowBitcast);
2775}
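// Worked example of the width arithmetic above (types invented): a
// <vscale x 8 x half> dupqlane of (a, b, a, b, a, b, a, b) simplifies to the
// pair (a, b), so PatternWidth = 16 * 2 = 32 bits and PatternElementCount =
// 16 * 8 / 32 = 4. The pair is splatted through a <vscale x 4 x i32>
// shufflevector with an all-zero mask and bitcast back to <vscale x 8 x half>.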
2776
2777static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2778 IntrinsicInst &II) {
2779 Value *A = II.getArgOperand(0);
2780 Value *B = II.getArgOperand(1);
2781 if (A == B)
2782 return IC.replaceInstUsesWith(II, A);
2783
2784 return std::nullopt;
2785}
2786
2787static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2788 IntrinsicInst &II) {
2789 Value *Pred = II.getOperand(0);
2790 Value *Vec = II.getOperand(1);
2791 Value *Shift = II.getOperand(2);
2792
2793 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2794 Value *AbsPred, *MergedValue;
2796 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2798 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2799
2800 return std::nullopt;
2801
2802 // Transform is valid if any of the following are true:
2803 // * The ABS merge value is an undef or non-negative
2804 // * The ABS predicate is all active
2805 // * The ABS predicate and the SRSHL predicates are the same
2806 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2807 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2808 return std::nullopt;
2809
2810 // Only valid when the shift amount is non-negative, otherwise the rounding
2811 // behaviour of SRSHL cannot be ignored.
2812 if (!match(Shift, m_NonNegative()))
2813 return std::nullopt;
2814
2815 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2816 {II.getType()}, {Pred, Vec, Shift});
2817
2818 return IC.replaceInstUsesWith(II, LSL);
2819}
2820
2821static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2822 IntrinsicInst &II) {
2823 Value *Vec = II.getOperand(0);
2824
2825 if (getSplatValue(Vec) == II.getOperand(1))
2826 return IC.replaceInstUsesWith(II, Vec);
2827
2828 return std::nullopt;
2829}
2830
2831static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2832 IntrinsicInst &II) {
2833 // If this barrier is post-dominated by an identical one, we can remove it.
2834 auto *NI = II.getNextNode();
2835 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2836 auto CanSkipOver = [](Instruction *I) {
2837 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2838 };
2839 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2840 auto *NIBB = NI->getParent();
2841 NI = NI->getNextNode();
2842 if (!NI) {
2843 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2844 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2845 else
2846 break;
2847 }
2848 }
2849 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2850 if (NextII && II.isIdenticalTo(NextII))
2851 return IC.eraseInstFromFunction(II);
2852
2853 return std::nullopt;
2854}
2855
2856static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2857 IntrinsicInst &II) {
2858 return IC.replaceInstUsesWith(
2859 II,
2860 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2861 {II.getType(), II.getOperand(0)->getType()},
2862 {II.getOperand(0), II.getOperand(1)}));
2863}
2864
2865static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2866 IntrinsicInst &II) {
2868 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2869 return std::nullopt;
2870}
2871
2872static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2874 unsigned NumBits) {
2875 Value *Passthru = II.getOperand(0);
2876 Value *Pg = II.getOperand(1);
2877 Value *Op = II.getOperand(2);
2878
2879 // Convert UXT[BHW] to AND.
2880 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2881 auto *Ty = cast<VectorType>(II.getType());
2882 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2883 auto *Mask = ConstantInt::get(Ty, MaskValue);
2884 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2885 {Pg, Op, Mask});
2886 return IC.replaceInstUsesWith(II, And);
2887 }
2888
2889 return std::nullopt;
2890}
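// Illustrative sketch (shorthand): with an undef passthru or an all-active
// predicate,
//   uxtb(undef, %pg, %x) --> and_u(%pg, %x, splat(0xff))
// and uxth/uxtw use the 0xffff and 0xffffffff masks respectively.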
2891
2892static std::optional<Instruction *>
2894 SMEAttrs FnSMEAttrs(*II.getFunction());
2895 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2896 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2897 return IC.replaceInstUsesWith(
2898 II, ConstantInt::getBool(II.getType(), IsStreaming));
2899 return std::nullopt;
2900}
2901
2902std::optional<Instruction *>
2904 IntrinsicInst &II) const {
2906 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2907 return I;
2908
2909 Intrinsic::ID IID = II.getIntrinsicID();
2910 switch (IID) {
2911 default:
2912 break;
2913 case Intrinsic::aarch64_dmb:
2914 return instCombineDMB(IC, II);
2915 case Intrinsic::aarch64_neon_fmaxnm:
2916 case Intrinsic::aarch64_neon_fminnm:
2917 return instCombineMaxMinNM(IC, II);
2918 case Intrinsic::aarch64_sve_convert_from_svbool:
2919 return instCombineConvertFromSVBool(IC, II);
2920 case Intrinsic::aarch64_sve_dup:
2921 return instCombineSVEDup(IC, II);
2922 case Intrinsic::aarch64_sve_dup_x:
2923 return instCombineSVEDupX(IC, II);
2924 case Intrinsic::aarch64_sve_cmpne:
2925 case Intrinsic::aarch64_sve_cmpne_wide:
2926 return instCombineSVECmpNE(IC, II);
2927 case Intrinsic::aarch64_sve_rdffr:
2928 return instCombineRDFFR(IC, II);
2929 case Intrinsic::aarch64_sve_lasta:
2930 case Intrinsic::aarch64_sve_lastb:
2931 return instCombineSVELast(IC, II);
2932 case Intrinsic::aarch64_sve_clasta_n:
2933 case Intrinsic::aarch64_sve_clastb_n:
2934 return instCombineSVECondLast(IC, II);
2935 case Intrinsic::aarch64_sve_cntd:
2936 return instCombineSVECntElts(IC, II, 2);
2937 case Intrinsic::aarch64_sve_cntw:
2938 return instCombineSVECntElts(IC, II, 4);
2939 case Intrinsic::aarch64_sve_cnth:
2940 return instCombineSVECntElts(IC, II, 8);
2941 case Intrinsic::aarch64_sve_cntb:
2942 return instCombineSVECntElts(IC, II, 16);
2943 case Intrinsic::aarch64_sme_cntsd:
2944 return instCombineSMECntsd(IC, II, ST);
2945 case Intrinsic::aarch64_sve_ptest_any:
2946 case Intrinsic::aarch64_sve_ptest_first:
2947 case Intrinsic::aarch64_sve_ptest_last:
2948 return instCombineSVEPTest(IC, II);
2949 case Intrinsic::aarch64_sve_fadd:
2950 return instCombineSVEVectorFAdd(IC, II);
2951 case Intrinsic::aarch64_sve_fadd_u:
2952 return instCombineSVEVectorFAddU(IC, II);
2953 case Intrinsic::aarch64_sve_fmul_u:
2954 return instCombineSVEVectorBinOp(IC, II);
2955 case Intrinsic::aarch64_sve_fsub:
2956 return instCombineSVEVectorFSub(IC, II);
2957 case Intrinsic::aarch64_sve_fsub_u:
2958 return instCombineSVEVectorFSubU(IC, II);
2959 case Intrinsic::aarch64_sve_add:
2960 return instCombineSVEVectorAdd(IC, II);
2961 case Intrinsic::aarch64_sve_add_u:
2962 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2963 Intrinsic::aarch64_sve_mla_u>(
2964 IC, II, true);
2965 case Intrinsic::aarch64_sve_sub:
2966 return instCombineSVEVectorSub(IC, II);
2967 case Intrinsic::aarch64_sve_sub_u:
2968 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2969 Intrinsic::aarch64_sve_mls_u>(
2970 IC, II, true);
2971 case Intrinsic::aarch64_sve_tbl:
2972 return instCombineSVETBL(IC, II);
2973 case Intrinsic::aarch64_sve_uunpkhi:
2974 case Intrinsic::aarch64_sve_uunpklo:
2975 case Intrinsic::aarch64_sve_sunpkhi:
2976 case Intrinsic::aarch64_sve_sunpklo:
2977 return instCombineSVEUnpack(IC, II);
2978 case Intrinsic::aarch64_sve_uzp1:
2979 return instCombineSVEUzp1(IC, II);
2980 case Intrinsic::aarch64_sve_zip1:
2981 case Intrinsic::aarch64_sve_zip2:
2982 return instCombineSVEZip(IC, II);
2983 case Intrinsic::aarch64_sve_ld1_gather_index:
2984 return instCombineLD1GatherIndex(IC, II);
2985 case Intrinsic::aarch64_sve_st1_scatter_index:
2986 return instCombineST1ScatterIndex(IC, II);
2987 case Intrinsic::aarch64_sve_ld1:
2988 return instCombineSVELD1(IC, II, DL);
2989 case Intrinsic::aarch64_sve_st1:
2990 return instCombineSVEST1(IC, II, DL);
2991 case Intrinsic::aarch64_sve_sdiv:
2992 return instCombineSVESDIV(IC, II);
2993 case Intrinsic::aarch64_sve_sel:
2994 return instCombineSVESel(IC, II);
2995 case Intrinsic::aarch64_sve_srshl:
2996 return instCombineSVESrshl(IC, II);
2997 case Intrinsic::aarch64_sve_dupq_lane:
2998 return instCombineSVEDupqLane(IC, II);
2999 case Intrinsic::aarch64_sve_insr:
3000 return instCombineSVEInsr(IC, II);
3001 case Intrinsic::aarch64_sve_whilelo:
3002 return instCombineWhilelo(IC, II);
3003 case Intrinsic::aarch64_sve_ptrue:
3004 return instCombinePTrue(IC, II);
3005 case Intrinsic::aarch64_sve_uxtb:
3006 return instCombineSVEUxt(IC, II, 8);
3007 case Intrinsic::aarch64_sve_uxth:
3008 return instCombineSVEUxt(IC, II, 16);
3009 case Intrinsic::aarch64_sve_uxtw:
3010 return instCombineSVEUxt(IC, II, 32);
3011 case Intrinsic::aarch64_sme_in_streaming_mode:
3012 return instCombineInStreamingMode(IC, II);
3013 }
3014
3015 return std::nullopt;
3016}
3017
3019 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3020 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3021 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3022 SimplifyAndSetOp) const {
3023 switch (II.getIntrinsicID()) {
3024 default:
3025 break;
3026 case Intrinsic::aarch64_neon_fcvtxn:
3027 case Intrinsic::aarch64_neon_rshrn:
3028 case Intrinsic::aarch64_neon_sqrshrn:
3029 case Intrinsic::aarch64_neon_sqrshrun:
3030 case Intrinsic::aarch64_neon_sqshrn:
3031 case Intrinsic::aarch64_neon_sqshrun:
3032 case Intrinsic::aarch64_neon_sqxtn:
3033 case Intrinsic::aarch64_neon_sqxtun:
3034 case Intrinsic::aarch64_neon_uqrshrn:
3035 case Intrinsic::aarch64_neon_uqshrn:
3036 case Intrinsic::aarch64_neon_uqxtn:
3037 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3038 break;
3039 }
3040
3041 return std::nullopt;
3042}
3043
3045 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3047}
3048
3051 switch (K) {
3053 return TypeSize::getFixed(64);
3055 if (ST->useSVEForFixedLengthVectors() &&
3056 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3057 return TypeSize::getFixed(
3058 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3059 else if (ST->isNeonAvailable())
3060 return TypeSize::getFixed(128);
3061 else
3062 return TypeSize::getFixed(0);
3064 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3066 return TypeSize::getScalable(128);
3067 else
3068 return TypeSize::getScalable(0);
3069 }
3070 llvm_unreachable("Unsupported register kind");
3071}
3072
3073bool AArch64TTIImpl::isSingleExtWideningInstruction(
3074 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3075 Type *SrcOverrideTy) const {
3076 // A helper that returns a vector type with the scalar type of the given
3077 // argument type and the element count of the destination type DstTy.
3078 auto toVectorTy = [&](Type *ArgTy) {
3079 return VectorType::get(ArgTy->getScalarType(),
3080 cast<VectorType>(DstTy)->getElementCount());
3081 };
3082
3083 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3084 // i32, i64]. SVE doesn't generally have the same set of instructions to
3085 // perform an extend with the add/sub/mul. There are SMULLB style
3086 // instructions, but they operate on top/bottom, requiring some sort of lane
3087 // interleaving to be used with zext/sext.
3088 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3089 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3090 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3091 return false;
3092
3093 Type *SrcTy = SrcOverrideTy;
3094 switch (Opcode) {
3095 case Instruction::Add: // UADDW(2), SADDW(2).
3096 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3097 // The second operand needs to be an extend
3098 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3099 if (!SrcTy)
3100 SrcTy =
3101 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3102 break;
3103 }
3104
3105 if (Opcode == Instruction::Sub)
3106 return false;
3107
3108 // UADDW(2), SADDW(2) can be commuted.
3109 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3110 if (!SrcTy)
3111 SrcTy =
3112 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3113 break;
3114 }
3115 return false;
3116 }
3117 default:
3118 return false;
3119 }
3120
3121 // Legalize the destination type and ensure it can be used in a widening
3122 // operation.
3123 auto DstTyL = getTypeLegalizationCost(DstTy);
3124 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3125 return false;
3126
3127 // Legalize the source type and ensure it can be used in a widening
3128 // operation.
3129 assert(SrcTy && "Expected some SrcTy");
3130 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3131 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3132 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3133 return false;
3134
3135 // Get the total number of vector elements in the legalized types.
3136 InstructionCost NumDstEls =
3137 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3138 InstructionCost NumSrcEls =
3139 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3140
3141 // Return true if the legalized types have the same number of vector elements
3142 // and the destination element type size is twice that of the source type.
3143 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3144}
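// Illustrative sketches (fixed-width NEON types, invented operands):
//   add(<8 x i16> %a, zext(<8 x i8> %b))  --> widening, matches uaddw
//   add(zext(<8 x i8> %b), <8 x i16> %a)  --> widening, the add commutes
//   sub(zext(<8 x i8> %b), <8 x i16> %a)  --> not widening, only the second
//                                             operand of a sub may be extended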
3145
3146Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3148 Type *SrcOverrideTy) const {
3149 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3150 Opcode != Instruction::Mul)
3151 return nullptr;
3152
3153 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3154 // i32, i64]. SVE doesn't generally have the same set of instructions to
3155 // perform an extend with the add/sub/mul. There are SMULLB style
3156 // instructions, but they operate on top/bottom, requiring some sort of lane
3157 // interleaving to be used with zext/sext.
3158 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3159 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3160 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3161 return nullptr;
3162
3163 auto getScalarSizeWithOverride = [&](const Value *V) {
3164 if (SrcOverrideTy)
3165 return SrcOverrideTy->getScalarSizeInBits();
3166 return cast<Instruction>(V)
3167 ->getOperand(0)
3168 ->getType()
3169 ->getScalarSizeInBits();
3170 };
3171
3172 unsigned MaxEltSize = 0;
3173 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3174 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3175 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3176 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3177 MaxEltSize = std::max(EltSize0, EltSize1);
3178 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3179 isa<SExtInst, ZExtInst>(Args[1])) {
3180 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3181 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3182 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3183 // enough.
3184 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3185 return nullptr;
3186 MaxEltSize = DstEltSize / 2;
3187 } else if (Opcode == Instruction::Mul &&
3188 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3189 // If one of the operands is a Zext and the other has enough zero bits
3190 // to be treated as unsigned, we can still generate a umull, meaning the
3191 // zext is free.
3192 KnownBits Known =
3193 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3194 if (Args[0]->getType()->getScalarSizeInBits() -
3195 Known.Zero.countLeadingOnes() >
3196 DstTy->getScalarSizeInBits() / 2)
3197 return nullptr;
3198
3199 MaxEltSize =
3200 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3201 } else
3202 return nullptr;
3203
3204 if (MaxEltSize * 2 > DstEltSize)
3205 return nullptr;
3206
3207 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3208 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3209 return nullptr;
3210 return ExtTy;
3211}
3212
3213// s/urhadd instructions implement the following pattern, making the
3214// extends free:
3215// %x = add ((zext i8 -> i16), 1)
3216// %y = (zext i8 -> i16)
3217// trunc i16 (lshr (add %x, %y), 1) -> i8
3218//
3219bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3220 Type *Src) const {
3221 // The source should be a legal vector type.
3222 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3223 (Src->isScalableTy() && !ST->hasSVE2()))
3224 return false;
3225
3226 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3227 return false;
3228
3229 // Look for trunc/lshr/add before trying to match the pattern.
3230 const Instruction *Add = ExtUser;
3231 auto *AddUser =
3232 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3233 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3234 Add = AddUser;
3235
3236 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3237 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3238 return false;
3239
3240 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3241 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3242 Src->getScalarSizeInBits() !=
3243 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3244 return false;
3245
3246 // Try to match the whole pattern. Ext could be either the first or second
3247 // m_ZExtOrSExt matched.
3248 Instruction *Ex1, *Ex2;
3249 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3250 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3251 return false;
3252
3253 // Ensure both extends are of the same type
3254 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3255 Ex1->getOpcode() == Ex2->getOpcode())
3256 return true;
3257
3258 return false;
3259}
3260
3261InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3262 Type *Src,
3263 TTI::CastContextHint CCH,
3264 TTI::TargetCostKind CostKind,
3265 const Instruction *I) const {
3266 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3267 assert(ISD && "Invalid opcode");
3268 // If the cast is observable, and it is used by a widening instruction (e.g.,
3269 // uaddl, saddw, etc.), it may be free.
3270 if (I && I->hasOneUser()) {
3271 auto *SingleUser = cast<Instruction>(*I->user_begin());
3272 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3273 if (Type *ExtTy = isBinExtWideningInstruction(
3274 SingleUser->getOpcode(), Dst, Operands,
3275 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3276 // The cost from Src->Src*2 needs to be added if required, the cost from
3277 // Src*2->ExtTy is free.
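      // Illustrative example (not from the original source): for
      //   mul(zext <4 x i8> to <4 x i32>, zext <4 x i16> to <4 x i32>)
      // the widening mul effectively consumes i16 inputs, so the i8 operand is
      // charged a zext i8 -> i16 here and the remaining i16 -> i32 half is
      // free.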
3278 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3279 Type *DoubleSrcTy =
3280 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3281 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3282 TTI::CastContextHint::None, CostKind);
3283 }
3284
3285 return 0;
3286 }
3287
3288 if (isSingleExtWideningInstruction(
3289 SingleUser->getOpcode(), Dst, Operands,
3290 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3291 // For adds only count the second operand as free if both operands are
3292 // extends but not the same operation (i.e., both operands are not free in
3293 // add(sext, zext)).
3294 if (SingleUser->getOpcode() == Instruction::Add) {
3295 if (I == SingleUser->getOperand(1) ||
3296 (isa<CastInst>(SingleUser->getOperand(1)) &&
3297 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3298 return 0;
3299 } else {
3300 // Others are free so long as isSingleExtWideningInstruction
3301 // returned true.
3302 return 0;
3303 }
3304 }
3305
3306 // The cast will be free for the s/urhadd instructions
3307 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3308 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3309 return 0;
3310 }
3311
3312 // TODO: Allow non-throughput costs that aren't binary.
3313 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3314 if (CostKind != TTI::TCK_RecipThroughput)
3315 return Cost == 0 ? 0 : 1;
3316 return Cost;
3317 };
3318
3319 EVT SrcTy = TLI->getValueType(DL, Src);
3320 EVT DstTy = TLI->getValueType(DL, Dst);
3321
3322 if (!SrcTy.isSimple() || !DstTy.isSimple())
3323 return AdjustCost(
3324 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3325
3326 // For the moment we do not have a lowering for SVE1-only fptrunc f64->bf16,
3327 // as we use fcvtx, which requires SVE2. Give these conversions an invalid cost.
3328 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3329 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3330 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3331 return InstructionCost::getInvalid();
3332
3333 static const TypeConversionCostTblEntry BF16Tbl[] = {
3334 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3335 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3336 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3337 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3338 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3339 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3340 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3341 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3342 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3343 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3344 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3345 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3346 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3347 };
3348
3349 if (ST->hasBF16())
3350 if (const auto *Entry = ConvertCostTableLookup(
3351 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3352 return AdjustCost(Entry->Cost);
3353
3354 // Symbolic constants for the SVE sitofp/uitofp entries in the table below.
3355 // The cost of unpacking twice is artificially increased for now in order
3356 // to avoid regressions against NEON, which will use tbl instructions directly
3357 // instead of multiple layers of [s|u]unpk[lo|hi].
3358 // We use the unpacks in cases where the destination type is illegal and
3359 // requires splitting of the input, even if the input type itself is legal.
3360 const unsigned int SVE_EXT_COST = 1;
3361 const unsigned int SVE_FCVT_COST = 1;
3362 const unsigned int SVE_UNPACK_ONCE = 4;
3363 const unsigned int SVE_UNPACK_TWICE = 16;
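  // Worked example (illustrative): the sitofp nxv8f32 <- nxv8i8 entry below is
  // SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST = 1 + 4 + 2 = 7.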
3364
3365 static const TypeConversionCostTblEntry ConversionTbl[] = {
3366 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3367 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3368 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3369 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3370 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3371 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3372 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3373 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3374 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3375 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3376 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3377 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3378 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3379 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3380 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3381 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3382 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3383 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3384 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3385 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3386
3387 // Truncations on nxvmiN
3388 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3389 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3390 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3391 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3392 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3393 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3394 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3395 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3396 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3397 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3398 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3399 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3400 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3401 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3402 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3403 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3404 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3405 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3406 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3407 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3408 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3409 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3410 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3411 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3412 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3413 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3414 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3415 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3416 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3417 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3418 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3419 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3420 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3421
3422 // The number of shll instructions for the extension.
3423 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3424 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3425 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3426 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3427 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3428 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3429 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3430 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3431 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3432 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3433 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3434 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3435 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3436 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3437 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3438 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3439
3440 // FP Ext and trunc
3441 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3442 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3443 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3444 // FP16
3445 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3446 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3447 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3448 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3449 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3450 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3451 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3452 // BF16 (uses shift)
3453 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3454 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3455 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3456 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3457 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3458 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3459 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3460 // FP Ext and trunc
3461 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3462 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3463 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3464 // FP16
3465 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3466 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3467 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3468 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3469 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3470 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3471 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3472 // BF16 (more complex; the +bf16 case is handled above)
3473 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3474 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3475 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3476 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3477 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3478 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3479 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3480 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3481
3482 // LowerVectorINT_TO_FP:
3483 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3484 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3485 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3486 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3487 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3488 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3489
3490 // SVE: to nxv2f16
3491 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3492 SVE_EXT_COST + SVE_FCVT_COST},
3493 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3494 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3495 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3496 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3497 SVE_EXT_COST + SVE_FCVT_COST},
3498 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3499 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3500 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3501
3502 // SVE: to nxv4f16
3503 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3504 SVE_EXT_COST + SVE_FCVT_COST},
3505 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3506 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3507 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3508 SVE_EXT_COST + SVE_FCVT_COST},
3509 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3510 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3511
3512 // SVE: to nxv8f16
3513 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3514 SVE_EXT_COST + SVE_FCVT_COST},
3515 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3516 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3517 SVE_EXT_COST + SVE_FCVT_COST},
3518 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3519
3520 // SVE: to nxv16f16
3521 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3522 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3523 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3524 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3525
3526 // Complex: to v2f32
3527 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3528 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3529 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3530 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3531
3532 // SVE: to nxv2f32
3533 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3534 SVE_EXT_COST + SVE_FCVT_COST},
3535 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3536 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3537 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3538 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3539 SVE_EXT_COST + SVE_FCVT_COST},
3540 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3541 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3542 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3543
3544 // Complex: to v4f32
3545 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3546 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3547 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3548 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3549
3550 // SVE: to nxv4f32
3551 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3552 SVE_EXT_COST + SVE_FCVT_COST},
3553 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3554 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3555 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3556 SVE_EXT_COST + SVE_FCVT_COST},
3557 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3558 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3559
3560 // Complex: to v8f32
3561 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3562 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3563 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3564 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3565
3566 // SVE: to nxv8f32
3567 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3568 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3569 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3570 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3571 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3572 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3573 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3574 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3575
3576 // SVE: to nxv16f32
3577 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3578 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3579 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3580 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3581
3582 // Complex: to v16f32
3583 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3584 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3585
3586 // Complex: to v2f64
3587 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3588 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3589 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3590 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3591 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3592 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3593
3594 // SVE: to nxv2f64
3595 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3596 SVE_EXT_COST + SVE_FCVT_COST},
3597 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3598 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3599 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3600 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3601 SVE_EXT_COST + SVE_FCVT_COST},
3602 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3603 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3604 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3605
3606 // Complex: to v4f64
3607 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3608 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3609
3610 // SVE: to nxv4f64
3611 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3612 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3613 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3614 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3615 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3616 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3617 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3618 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3619 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3620 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3621 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3622 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3623
3624 // SVE: to nxv8f64
3625 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3626 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3627 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3628 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3629 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3630 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3631 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3632 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3633
3634 // LowerVectorFP_TO_INT
3635 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3636 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3637 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3638 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3639 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3640 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3641
3642 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3643 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3644 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3645 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3646 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3647 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3648 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3649
3650 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3651 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3652 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3653 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3654 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3655
3656 // Complex, from nxv2f32.
3657 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3658 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3659 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3660 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3661 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3662 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3663 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3664 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3665
3666 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3667 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3668 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3669 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3670 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3671 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3672 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3673
3674 // Complex, from nxv2f64.
3675 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3676 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3677 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3678 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3679 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3680 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3681 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3682 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3683 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3684 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3685
3686 // Complex, from nxv4f32.
3687 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3688 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3689 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3690 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3691 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3692 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3693 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3694 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3695 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3696 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3697
3698 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3699 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3700 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3701 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3702 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3703
3704 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3705 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3706 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3707 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3708 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3709 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3710 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3711
3712 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3713 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3714 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3715 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3716 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3717
3718 // Complex, from nxv8f16.
3719 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3720 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3721 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3722 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3723 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3724 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3725 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3726 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3727 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3728 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3729
3730 // Complex, from nxv4f16.
3731 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3732 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3733 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3734 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3735 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3736 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3737 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3738 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3739
3740 // Complex, from nxv2f16.
3741 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3742 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3743 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3744 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3745 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3746 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3747 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3748 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3749
3750 // Truncate from nxvmf32 to nxvmf16.
3751 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3752 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3753 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3754
3755 // Truncate from nxvmf32 to nxvmbf16.
3756 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3757 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3758 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3759
3760 // Truncate from nxvmf64 to nxvmf16.
3761 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3762 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3763 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3764
3765 // Truncate from nxvmf64 to nxvmbf16.
3766 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3767 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3768 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3769
3770 // Truncate from nxvmf64 to nxvmf32.
3771 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3772 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3773 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3774
3775 // Extend from nxvmf16 to nxvmf32.
3776 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3777 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3778 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3779
3780 // Extend from nxvmbf16 to nxvmf32.
3781 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3782 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3783 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3784
3785 // Extend from nxvmf16 to nxvmf64.
3786 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3787 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3788 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3789
3790 // Extend from nxvmbf16 to nxvmf64.
3791 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3792 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3793 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3794
3795 // Extend from nxvmf32 to nxvmf64.
3796 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3797 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3798 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3799
3800 // Bitcasts from float to integer
3801 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3802 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3803 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3804
3805 // Bitcasts from integer to float
3806 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3807 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3808 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3809
3810 // Add cost for extending to illegal (too wide) scalable vectors.
3811 // zero/sign extend are implemented by multiple unpack operations,
3812 // where each operation has a cost of 1.
3813 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3814 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3815 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3816 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3817 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3818 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3819
3820 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3821 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3822 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3823 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3824 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3825 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3826 };
3827
3828 // We have to estimate the cost of a fixed-length operation performed on SVE
3829 // registers by multiplying the cost of the equivalent scalable operation by
3830 // the number of SVE registers required to hold the fixed-length type.
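  // For example (illustrative): on a subtarget where SVE is used for
  // fixed-length vectors, a v16i64 -> v16i32 trunc is costed as LT.first
  // copies of an equivalent scalable conversion (e.g. nxv2i64 -> nxv2i32)
  // built below.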
3831 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3832 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3833 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3834 ST->useSVEForFixedLengthVectors(WiderTy)) {
3835 std::pair<InstructionCost, MVT> LT =
3836 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3837 unsigned NumElements =
3838 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3839 return AdjustCost(
3840 LT.first *
3841 getCastInstrCost(
3842 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3843 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3844 CostKind, I));
3845 }
3846
3847 if (const auto *Entry = ConvertCostTableLookup(
3848 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3849 return AdjustCost(Entry->Cost);
3850
3851 static const TypeConversionCostTblEntry FP16Tbl[] = {
3852 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3853 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3854 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3855 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3856 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3857 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3858 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3859 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3860 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3861 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3862 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3863 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3864 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3865 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3866 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3867 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3868 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3869 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3870 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3871 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3872 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3873 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3874 };
3875
3876 if (ST->hasFullFP16())
3877 if (const auto *Entry = ConvertCostTableLookup(
3878 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3879 return AdjustCost(Entry->Cost);
3880
3881 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3882 // double-rounding issues.
3883 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3884 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3886 return AdjustCost(
3888 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3889 CCH, CostKind) +
3891 CostKind) +
3893 CostKind));
3894
3895 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3897 ST->isSVEorStreamingSVEAvailable() &&
3898 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3900 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3902 // The standard behaviour in the backend for these cases is to split the
3903 // extend up into two parts:
3904 // 1. Perform an extending load or masked load up to the legal type.
3905 // 2. Extend the loaded data to the final type.
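    // Illustrative example (not from the original source): a zext of a masked
    // load result from nxv8i8 to nxv8i64 is costed as Part1 (the extend up to
    // the legal type, e.g. nxv8i16) plus Part2 (nxv8i16 -> nxv8i64).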
3906 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3907 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3908 InstructionCost Part1 = getCastInstrCost(
3909 Opcode, LegalTy, Src, CCH, CostKind, I);
3910 InstructionCost Part2 = getCastInstrCost(
3911 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3912 return Part1 + Part2;
3913 }
3914
3915 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3916 // but we also want to include the TTI::CastContextHint::Masked case too.
3917 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3919 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3921
3922 return AdjustCost(
3923 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3924}
3925
3926 InstructionCost
3927AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3928 VectorType *VecTy, unsigned Index,
3929 TTI::TargetCostKind CostKind) const {
3930
3931 // Make sure we were given a valid extend opcode.
3932 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3933 "Invalid opcode");
3934
3935 // We are extending an element we extract from a vector, so the source type
3936 // of the extend is the element type of the vector.
3937 auto *Src = VecTy->getElementType();
3938
3939 // Sign- and zero-extends are for integer types only.
3940 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3941
3942 // Get the cost for the extract. We compute the cost (if any) for the extend
3943 // below.
3944 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3945 CostKind, Index, nullptr, nullptr);
3946
3947 // Legalize the types.
3948 auto VecLT = getTypeLegalizationCost(VecTy);
3949 auto DstVT = TLI->getValueType(DL, Dst);
3950 auto SrcVT = TLI->getValueType(DL, Src);
3951
3952 // If the resulting type is still a vector and the destination type is legal,
3953 // we may get the extension for free. If not, get the default cost for the
3954 // extend.
3955 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3956 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3957 CostKind);
3958
3959 // The destination type should be larger than the element type. If not, get
3960 // the default cost for the extend.
3961 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3962 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3963 CostKind);
3964
3965 switch (Opcode) {
3966 default:
3967 llvm_unreachable("Opcode should be either SExt or ZExt");
3968
3969 // For sign-extends, we only need a smov, which performs the extension
3970 // automatically.
3971 case Instruction::SExt:
3972 return Cost;
3973
3974 // For zero-extends, the extend is performed automatically by a umov unless
3975 // the destination type is i64 and the element type is i8 or i16.
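  // For example (illustrative): a zext of an extracted i16 lane to i32 is
  // treated as free here, whereas a zext of an i8 or i16 lane to i64 falls
  // through and is charged the normal cast cost below.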
3976 case Instruction::ZExt:
3977 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3978 return Cost;
3979 }
3980
3981 // If we are unable to perform the extend for free, get the default cost.
3982 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3983 CostKind);
3984}
3985
3986InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3987 TTI::TargetCostKind CostKind,
3988 const Instruction *I) const {
3989 if (CostKind != TTI::TCK_RecipThroughput)
3990 return Opcode == Instruction::PHI ? 0 : 1;
3991 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3992 // Branches are assumed to be predicted.
3993 return 0;
3994}
3995
3996InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3997 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3998 const Instruction *I, Value *Scalar,
3999 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4000 assert(Val->isVectorTy() && "This must be a vector type");
4001
4002 if (Index != -1U) {
4003 // Legalize the type.
4004 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4005
4006 // This type is legalized to a scalar type.
4007 if (!LT.second.isVector())
4008 return 0;
4009
4010 // The type may be split. For fixed-width vectors we can normalize the
4011 // index to the new type.
4012 if (LT.second.isFixedLengthVector()) {
4013 unsigned Width = LT.second.getVectorNumElements();
4014 Index = Index % Width;
4015 }
4016
4017 // The element at index zero is already inside the vector.
4018 // - For an insert-element or extract-element instruction that extracts
4019 // integers, an explicit FPR -> GPR move is needed, so it has a non-zero
4020 // cost.
4021 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4022 return 0;
4023
4024 // This is recognising an LD1 (single-element structure to one lane of one
4025 // register) instruction. I.e., if this is an `insertelement` instruction
4026 // and its second operand is a load, then we will generate an LD1, which
4027 // is an expensive instruction.
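    // Illustrative example (not from the original source):
    //   %l = load float, ptr %p
    //   %v = insertelement <4 x float> %q, float %l, i32 1
    // is selected as "ld1 { v0.s }[1], [x0]", which is why the full base cost
    // is returned here rather than treating the insert as free.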
4028 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
4029 return CostKind == TTI::TCK_CodeSize
4030 ? 0
4031 : ST->getVectorInsertExtractBaseCost();
4032
4033 // i1 inserts and extracts will include an extra cset or cmp of the vector
4034 // value. Increase the cost by 1 to account for this.
4035 if (Val->getScalarSizeInBits() == 1)
4036 return CostKind == TTI::TCK_CodeSize
4037 ? 2
4038 : ST->getVectorInsertExtractBaseCost() + 1;
4039
4040 // FIXME:
4041 // If the extract-element and insert-element instructions could be
4042 // simplified away (e.g., could be combined into users by looking at use-def
4043 // context), they have no cost. This is not done in the first place for
4044 // compile-time considerations.
4045 }
4046
4047 // In the case of Neon, if there exists an extractelement from lane != 0 such that
4048 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4049 // 2. extractelement result feeds into fmul.
4050 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4051 // equivalent to 0.
4052 // then the extractelement can be merged with fmul in the backend and it
4053 // incurs no cost.
4054 // e.g.
4055 // define double @foo(<2 x double> %a) {
4056 // %1 = extractelement <2 x double> %a, i32 0
4057 // %2 = extractelement <2 x double> %a, i32 1
4058 // %res = fmul double %1, %2
4059 // ret double %res
4060 // }
4061 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4062 auto ExtractCanFuseWithFmul = [&]() {
4063 // We bail out if the extract is from lane 0.
4064 if (Index == 0)
4065 return false;
4066
4067 // Check if the scalar element type of the vector operand of ExtractElement
4068 // instruction is one of the allowed types.
4069 auto IsAllowedScalarTy = [&](const Type *T) {
4070 return T->isFloatTy() || T->isDoubleTy() ||
4071 (T->isHalfTy() && ST->hasFullFP16());
4072 };
4073
4074 // Check if the extractelement user is scalar fmul.
4075 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4076 // Check if the user is scalar fmul.
4077 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4078 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4079 !BO->getType()->isVectorTy();
4080 };
4081
4082 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4083 // certain scalar type and a certain vector register width.
4084 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4085 auto RegWidth =
4086 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4087 .getFixedValue();
4088 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4089 };
4090
4091 // Check if the type constraints on input vector type and result scalar type
4092 // of extractelement instruction are satisfied.
4093 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4094 return false;
4095
4096 if (Scalar) {
4097 DenseMap<User *, unsigned> UserToExtractIdx;
4098 for (auto *U : Scalar->users()) {
4099 if (!IsUserFMulScalarTy(U))
4100 return false;
4101 // Recording an entry for the user is important; the index value itself is
4102 // not.
4103 UserToExtractIdx[U];
4104 }
4105 if (UserToExtractIdx.empty())
4106 return false;
4107 for (auto &[S, U, L] : ScalarUserAndIdx) {
4108 for (auto *U : S->users()) {
4109 if (UserToExtractIdx.contains(U)) {
4110 auto *FMul = cast<BinaryOperator>(U);
4111 auto *Op0 = FMul->getOperand(0);
4112 auto *Op1 = FMul->getOperand(1);
4113 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4114 UserToExtractIdx[U] = L;
4115 break;
4116 }
4117 }
4118 }
4119 }
4120 for (auto &[U, L] : UserToExtractIdx) {
4121 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4122 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4123 return false;
4124 }
4125 } else {
4126 const auto *EE = cast<ExtractElementInst>(I);
4127
4128 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4129 if (!IdxOp)
4130 return false;
4131
4132 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4133 if (!IsUserFMulScalarTy(U))
4134 return false;
4135
4136 // Check if the other operand of extractelement is also extractelement
4137 // from lane equivalent to 0.
4138 const auto *BO = cast<BinaryOperator>(U);
4139 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4140 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4141 if (OtherEE) {
4142 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4143 if (!IdxOp)
4144 return false;
4145 return IsExtractLaneEquivalentToZero(
4146 cast<ConstantInt>(OtherEE->getIndexOperand())
4147 ->getValue()
4148 .getZExtValue(),
4149 OtherEE->getType()->getScalarSizeInBits());
4150 }
4151 return true;
4152 });
4153 }
4154 return true;
4155 };
4156
4157 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4158 ExtractCanFuseWithFmul())
4159 return 0;
4160
4161 // All other insert/extracts cost this much.
4162 return CostKind == TTI::TCK_CodeSize ? 1
4163 : ST->getVectorInsertExtractBaseCost();
4164}
4165
4166InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4167 TTI::TargetCostKind CostKind,
4168 unsigned Index,
4169 const Value *Op0,
4170 const Value *Op1) const {
4171 // Treat an insert at lane 0 into a poison vector as having zero cost. This
4172 // ensures vector broadcasts via an insert + shuffle (which will be lowered to
4173 // a single dup) are treated as cheap.
4174 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4175 isa<PoisonValue>(Op0))
4176 return 0;
4177 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4178}
4179
4180InstructionCost AArch64TTIImpl::getVectorInstrCost(
4181 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4182 Value *Scalar,
4183 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4184 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4185 ScalarUserAndIdx);
4186}
4187
4188InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4189 Type *Val,
4190 TTI::TargetCostKind CostKind,
4191 unsigned Index) const {
4192 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4193}
4194
4198 unsigned Index) const {
4199 if (isa<FixedVectorType>(Val))
4201 Index);
4202
4203 // This typically requires both while and lastb instructions in order
4204 // to extract the last element. If this is in a loop the while
4205 // instruction can at least be hoisted out, although it will consume a
4206 // predicate register. The cost should be more expensive than the base
4207 // extract cost, which is 2 for most CPUs.
4208 return CostKind == TTI::TCK_CodeSize
4209 ? 2
4210 : ST->getVectorInsertExtractBaseCost() + 1;
4211}
4212
4213InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4214 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4215 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4216 ArrayRef<Value *> VL) const {
4219 if (Ty->getElementType()->isFloatingPointTy())
4220 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4221 CostKind);
4222 unsigned VecInstCost =
4223 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4224 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4225}
4226
4227std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4228 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4229 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4230 std::function<InstructionCost(Type *)> InstCost) const {
4231 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4232 return std::nullopt;
4233 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4234 return std::nullopt;
4235 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4236 ST->isNonStreamingSVEorSME2Available())
4237 return std::nullopt;
4238
4239 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4240 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4241 TTI::CastContextHint::None, CostKind);
4242 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4243 Cost *= 2;
4244 Cost += InstCost(PromotedTy);
4245 if (IncludeTrunc)
4246 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4247 TTI::CastContextHint::None, CostKind);
4248 return Cost;
4249}
4250
4251InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4252 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4253 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4254 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4255
4256 // The code-generator is currently not able to handle scalable vectors
4257 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4258 // it. This change will be removed when code-generation for these types is
4259 // sufficiently reliable.
4260 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4261 if (VTy->getElementCount() == ElementCount::getScalable(1))
4262 return InstructionCost::getInvalid();
4263
4264 // TODO: Handle more cost kinds.
4265 if (CostKind != TTI::TCK_RecipThroughput)
4266 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4267 Op2Info, Args, CxtI);
4268
4269 // Legalize the type.
4270 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4271 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4272
4273 // Increase the cost for half and bfloat types if not architecturally
4274 // supported.
4275 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4276 ISD == ISD::FDIV || ISD == ISD::FREM)
4277 if (auto PromotedCost = getFP16BF16PromoteCost(
4278 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4279 // There is no native support for fdiv/frem even with +sve-b16b16.
4280 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4281 [&](Type *PromotedTy) {
4282 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4283 Op1Info, Op2Info);
4284 }))
4285 return *PromotedCost;
4286
4287 // If the operation is a widening instruction (smull or umull) and both
4288 // operands are extends the cost can be cheaper by considering that the
4289 // operation will operate on the narrowest type size possible (double the
4290 // largest input size) and a further extend.
4291 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4292 if (ExtTy != Ty)
4293 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4294 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4295 TTI::CastContextHint::None, CostKind);
4296 return LT.first;
4297 }
4298
4299 switch (ISD) {
4300 default:
4301 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4302 Op2Info);
4303 case ISD::SREM:
4304 case ISD::SDIV:
4305 /*
4306 Notes for sdiv/srem specific costs:
4307 1. This only considers the cases where the divisor is constant, uniform and
4308 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4309 result in some form of (ldr + adrp), corresponding to constant vectors, or
4310 scalarization of the division operation.
4311 2. Constant divisors, either negative in whole or partially, don't result in
4312 significantly different codegen as compared to positive constant divisors.
4313 So, we don't consider negative divisors separately.
4314 3. If the codegen is significantly different with SVE, it has been indicated
4315 using comments at appropriate places.
4316
4317 sdiv specific cases:
4318 -----------------------------------------------------------------------
4319 codegen | pow-of-2 | Type
4320 -----------------------------------------------------------------------
4321 add + cmp + csel + asr | Y | i64
4322 add + cmp + csel + asr | Y | i32
4323 -----------------------------------------------------------------------
4324
4325 srem specific cases:
4326 -----------------------------------------------------------------------
4327 codegen | pow-of-2 | Type
4328 -----------------------------------------------------------------------
4329 negs + and + and + csneg | Y | i64
4330 negs + and + and + csneg | Y | i32
4331 -----------------------------------------------------------------------
4332
4333 other sdiv/srem cases:
4334 -------------------------------------------------------------------------
4335 common codegen | + srem | + sdiv | pow-of-2 | Type
4336 -------------------------------------------------------------------------
4337 smulh + asr + add + add | - | - | N | i64
4338 smull + lsr + add + add | - | - | N | i32
4339 usra | and + sub | sshr | Y | <2 x i64>
4340 2 * (scalar code) | - | - | N | <2 x i64>
4341 usra | bic + sub | sshr + neg | Y | <4 x i32>
4342 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4343 + sshr + usra | | | |
4344 -------------------------------------------------------------------------
4345 */
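    // Worked example (illustrative): for a scalar division by a uniform
    // power-of-2 constant such as
    //   %r = sdiv i32 %a, 8
    // the lowering is add+cmp+csel+asr, so the cost computed below is
    // 3 * AddCost + AsrCost (and 3 * AsrCost + AddCost for the matching srem).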
4346 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4347 InstructionCost AddCost =
4348 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4349 Op1Info.getNoProps(), Op2Info.getNoProps());
4350 InstructionCost AsrCost =
4351 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4352 Op1Info.getNoProps(), Op2Info.getNoProps());
4353 InstructionCost MulCost =
4354 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4355 Op1Info.getNoProps(), Op2Info.getNoProps());
4356 // add/cmp/csel/csneg should all have similar costs, as should
4357 // asr/negs/and.
4358 auto VT = TLI->getValueType(DL, Ty);
4359 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4360 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4361 // Neg can be folded into the asr instruction.
4362 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4363 : (3 * AsrCost + AddCost);
4364 } else {
4365 return MulCost + AsrCost + 2 * AddCost;
4366 }
4367 } else if (VT.isVector()) {
4368 InstructionCost UsraCost = 2 * AsrCost;
4369 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4370 // Division with scalable types corresponds to native 'asrd'
4371 // instruction when SVE is available.
4372 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4373
4374 // One more for the negation in SDIV
4375 InstructionCost Cost =
4376 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4377 if (Ty->isScalableTy() && ST->hasSVE())
4378 Cost += 2 * AsrCost;
4379 else {
4380 Cost +=
4381 UsraCost +
4382 (ISD == ISD::SDIV
4383 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4384 : 2 * AddCost);
4385 }
4386 return Cost;
4387 } else if (LT.second == MVT::v2i64) {
4388 return VT.getVectorNumElements() *
4389 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4390 Op1Info.getNoProps(),
4391 Op2Info.getNoProps());
4392 } else {
4393 // When SVE is available, we get:
4394 // smulh + lsr + add/sub + asr + add/sub.
4395 if (Ty->isScalableTy() && ST->hasSVE())
4396 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4397 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4398 }
4399 }
4400 }
4401 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4402 LT.second.isFixedLengthVector()) {
4403 // FIXME: When the constant vector is non-uniform, this may result in
4404 // loading the vector from a constant pool or, in some cases, in
4405 // scalarization. For now, we are approximating this with the
4406 // scalarization cost.
4407 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4408 CostKind, -1, nullptr, nullptr);
4409 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4410 CostKind, -1, nullptr, nullptr);
4411 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4412 return ExtractCost + InsertCost +
4413 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4414 CostKind, Op1Info.getNoProps(),
4415 Op2Info.getNoProps());
4416 }
4417 [[fallthrough]];
4418 case ISD::UDIV:
4419 case ISD::UREM: {
4420 auto VT = TLI->getValueType(DL, Ty);
4421 if (Op2Info.isConstant()) {
4422 // If the operand is a power of 2 we can use the shift or and cost.
4423 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4424 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4425 Op1Info.getNoProps(),
4426 Op2Info.getNoProps());
4427 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4428 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4429 Op1Info.getNoProps(),
4430 Op2Info.getNoProps());
4431
4432 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4433 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4434 // The MULHU will be expanded to UMULL for the types not listed below,
4435 // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
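        // Worked example (illustrative): for a v4i32 udiv by a non-power-of-2
        // constant, HasMULH is false and Is128bit is true, so DivCost below is
        // 2 * MulCost + ShrCost + AddCost * 2 + ShrCost.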
4436 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4437 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4438 LT.second == MVT::nxv16i8;
4439 bool Is128bit = LT.second.is128BitVector();
4440
4441 InstructionCost MulCost =
4442 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4443 Op1Info.getNoProps(), Op2Info.getNoProps());
4444 InstructionCost AddCost =
4445 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4446 Op1Info.getNoProps(), Op2Info.getNoProps());
4447 InstructionCost ShrCost =
4448 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4449 Op1Info.getNoProps(), Op2Info.getNoProps());
4450 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4451 (HasMULH ? 0 : ShrCost) + // UMULL shift
4452 AddCost * 2 + ShrCost;
4453 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4454 }
4455 }
4456
4457 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4458 // emitted by the backend even when those functions are not declared in the
4459 // module.
4460 if (!VT.isVector() && VT.getSizeInBits() > 64)
4461 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4462
4463 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4464 Opcode, Ty, CostKind, Op1Info, Op2Info);
4465 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4466 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4467 // SDIV/UDIV operations are lowered using SVE, so the costs are
4468 // lower.
4469 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4470 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4471 static const CostTblEntry DivTbl[]{
4472 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4473 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4474 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4475 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4476 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4477 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4478
4479 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4480 if (nullptr != Entry)
4481 return Entry->Cost;
4482 }
4483 // For 8/16-bit elements, the cost is higher because the type
4484 // requires promotion and possibly splitting:
4485 if (LT.second.getScalarType() == MVT::i8)
4486 Cost *= 8;
4487 else if (LT.second.getScalarType() == MVT::i16)
4488 Cost *= 4;
4489 return Cost;
4490 } else {
4491 // If one of the operands is a uniform constant, then the cost for each
4492 // element is the cost of insertion, extraction and division.
4493 // Insertion cost = 2, extraction cost = 2, division = cost of the
4494 // operation on the scalar type.
4495 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4496 (Op2Info.isConstant() && Op2Info.isUniform())) {
4497 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4498 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4499 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4500 return (4 + DivCost) * VTy->getNumElements();
4501 }
4502 }
4503 // On AArch64, without SVE, vector divisions are expanded
4504 // into scalar divisions of each pair of elements.
4505 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4506 -1, nullptr, nullptr);
4507 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4508 nullptr, nullptr);
4509 }
4510
4511 // TODO: if one of the arguments is scalar, then it's not necessary to
4512 // double the cost of handling the vector elements.
4513 Cost += Cost;
4514 }
4515 return Cost;
4516 }
4517 case ISD::MUL:
4518 // When SVE is available, then we can lower the v2i64 operation using
4519 // the SVE mul instruction, which has a lower cost.
4520 if (LT.second == MVT::v2i64 && ST->hasSVE())
4521 return LT.first;
4522
4523 // When SVE is not available, there is no MUL.2d instruction,
4524 // which means mul <2 x i64> is expensive as elements are extracted
4525 // from the vectors and the muls scalarized.
4526 // As getScalarizationOverhead is a bit too pessimistic, we
4527 // estimate the cost for a i64 vector directly here, which is:
4528 // - four 2-cost i64 extracts,
4529 // - two 2-cost i64 inserts, and
4530 // - two 1-cost muls.
4531 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4532 // LT.first = 2 the cost is 28.
4533 if (LT.second != MVT::v2i64)
4534 return LT.first;
4535 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4536 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4537 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4538 nullptr, nullptr) *
4539 2 +
4540 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4541 nullptr, nullptr));
4542 case ISD::ADD:
4543 case ISD::XOR:
4544 case ISD::OR:
4545 case ISD::AND:
4546 case ISD::SRL:
4547 case ISD::SRA:
4548 case ISD::SHL:
4549 // These nodes are marked as 'custom' for combining purposes only.
4550 // We know that they are legal. See LowerAdd in ISelLowering.
4551 return LT.first;
4552
4553 case ISD::FNEG:
4554 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4555 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4556 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4557 CxtI &&
4558 ((CxtI->hasOneUse() &&
4559 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4560 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4561 return 0;
4562 [[fallthrough]];
4563 case ISD::FADD:
4564 case ISD::FSUB:
4565 if (!Ty->getScalarType()->isFP128Ty())
4566 return LT.first;
4567 [[fallthrough]];
4568 case ISD::FMUL:
4569 case ISD::FDIV:
4570 // These nodes are marked as 'custom' just to lower them to SVE.
4571 // We know said lowering will incur no additional cost.
4572 if (!Ty->getScalarType()->isFP128Ty())
4573 return 2 * LT.first;
4574
4575 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4576 Op2Info);
4577 case ISD::FREM:
4578 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4579 // those functions are not declared in the module.
4580 if (!Ty->isVectorTy())
4581 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4582 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4583 Op2Info);
4584 }
4585}
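
// An illustrative, standalone sketch (not taken from this file) of the
// scalarised i64 vector multiply cost computed by the ISD::MUL case above,
// assuming the per-operation costs quoted in its comment: extract = 2,
// insert = 2, scalar mul = 1. Each element needs two extracts, one insert
// and one multiply, so v2i64 costs 2 * (2*2 + 2 + 1) = 14 and v4i64
// (LT.first = 2) costs 28, matching the comment above.
static unsigned scalarisedI64MulCostSketch(unsigned NumElts) {
  const unsigned ExtractCost = 2, InsertCost = 2, MulCost = 1;
  return NumElts * (2 * ExtractCost + InsertCost + MulCost);
}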
4586
4589 const SCEV *Ptr,
4591 // Address computations in vectorized code with non-consecutive addresses will
4592 // likely result in more instructions compared to scalar code where the
4593 // computation can more often be merged into the index mode. The resulting
4594 // extra micro-ops can significantly decrease throughput.
4595 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4596 int MaxMergeDistance = 64;
4597
4598 if (PtrTy->isVectorTy() && SE &&
4599 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4600 return NumVectorInstToHideOverhead;
4601
4602 // In many cases the address computation is not merged into the instruction
4603 // addressing mode.
4604 return 1;
4605}
4606
4607 /// Check whether Opcode1 has lower throughput than Opcode2 according to the
4608 /// scheduling model.
4610 unsigned Opcode1, unsigned Opcode2) const {
4611 const MCSchedModel &Sched = ST->getSchedModel();
4612 const TargetInstrInfo *TII = ST->getInstrInfo();
4613 if (!Sched.hasInstrSchedModel())
4614 return false;
4615
4616 const MCSchedClassDesc *SCD1 =
4617 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4618 const MCSchedClassDesc *SCD2 =
4619 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4620 // We cannot handle variant scheduling classes without an MI. If we need to
4621 // support them for any of the instructions we query the information of we
4622 // might need to add a way to resolve them without a MI or not use the
4623 // scheduling info.
4624 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4625 "Cannot handle variant scheduling classes without an MI");
4626 if (!SCD1->isValid() || !SCD2->isValid())
4627 return false;
4628
4629 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4631}
4632
4634 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4636 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4637 // Some vector selects wider than the register width are not lowered well.
4638 // TODO: Improve this with different cost kinds.
4639 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4640 // We would need this many instructions to hide the scalarization happening.
4641 const int AmortizationCost = 20;
4642
4643 // If VecPred is not set, check if we can get a predicate from the context
4644 // instruction, if its type matches the requested ValTy.
4645 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4646 CmpPredicate CurrentPred;
4647 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4648 m_Value())))
4649 VecPred = CurrentPred;
4650 }
4651 // Check if we have a compare/select chain that can be lowered using
4652 // a (F)CMxx & BFI pair.
4653 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4654 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4655 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4656 VecPred == CmpInst::FCMP_UNE) {
4657 static const auto ValidMinMaxTys = {
4658 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4659 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4660 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4661
4662 auto LT = getTypeLegalizationCost(ValTy);
4663 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4664 (ST->hasFullFP16() &&
4665 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4666 return LT.first;
4667 }
4668
4669 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4670 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4671 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4672 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4673 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4674 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4675 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4676 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4677 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4678 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4679 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4680 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4681
4682 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4683 EVT SelValTy = TLI->getValueType(DL, ValTy);
4684 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4685 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4686 SelCondTy.getSimpleVT(),
4687 SelValTy.getSimpleVT()))
4688 return Entry->Cost;
4689 }
4690 }
4691
4692 if (Opcode == Instruction::FCmp) {
4693 if (auto PromotedCost = getFP16BF16PromoteCost(
4694 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4695 // TODO: Consider costing SVE FCMPs.
4696 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4698 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4699 CostKind, Op1Info, Op2Info);
4700 if (isa<VectorType>(PromotedTy))
4702 Instruction::Trunc,
4706 return Cost;
4707 }))
4708 return *PromotedCost;
4709
4710 auto LT = getTypeLegalizationCost(ValTy);
4711 // Model unknown fp compares as a libcall.
4712 if (LT.second.getScalarType() != MVT::f64 &&
4713 LT.second.getScalarType() != MVT::f32 &&
4714 LT.second.getScalarType() != MVT::f16)
4715 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4716 {ValTy, ValTy}, CostKind);
4717
4718 // Some comparison operators require expanding to multiple compares + or.
4719 unsigned Factor = 1;
4720 if (!CondTy->isVectorTy() &&
4721 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4722 Factor = 2; // fcmp with 2 selects
4723 else if (isa<FixedVectorType>(ValTy) &&
4724 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4725 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4726 Factor = 3; // fcmxx+fcmyy+or
4727 else if (isa<ScalableVectorType>(ValTy) &&
4728 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4729 Factor = 3; // fcmxx+fcmyy+or
4730
4731 if (isa<ScalableVectorType>(ValTy) &&
4733 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4734 AArch64::FCMEQv4f32))
4735 Factor *= 2;
4736
4737 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4738 }
4739
4740 // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be folded
4741 // to icmp(and, 0), as free, since we can make use of ands, but only if the
4742 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4743 // provided it will not cause performance regressions.
4744 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4745 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4746 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4747 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4748 if (match(I->getOperand(1), m_Zero()))
4749 return 0;
4750
4751 // x >= 1 / x < 1 -> x > 0 / x <= 0
4752 if (match(I->getOperand(1), m_One()) &&
4753 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4754 return 0;
4755
4756 // x <= -1 / x > -1 -> x > 0 / x <= 0
4757 if (match(I->getOperand(1), m_AllOnes()) &&
4758 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4759 return 0;
4760 }
4761
4762 // The base case handles scalable vectors fine for now, since it treats the
4763 // cost as 1 * legalization cost.
4764 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4765 Op1Info, Op2Info, I);
4766}
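
// An illustrative sketch (not taken from this file) of the expansion factor
// applied to fp compares above: predicates such as FCMP_ONE and FCMP_UEQ
// have no single AArch64 condition, so a scalar compare expands to one fcmp
// plus two selects, while fixed or scalable vector compares expand to two
// compares combined with an orr (as do ORD/UNO for fixed vectors). The enum
// and helper names are illustrative only.
enum class FPCmpShapeSketch { Scalar, FixedVector, ScalableVector };
static unsigned fcmpExpansionFactorSketch(FPCmpShapeSketch Shape,
                                          bool IsOneOrUeq, bool IsOrdOrUno) {
  if (Shape == FPCmpShapeSketch::Scalar)
    return IsOneOrUeq ? 2 : 1; // fcmp + 2 selects
  if (Shape == FPCmpShapeSketch::FixedVector)
    return (IsOneOrUeq || IsOrdOrUno) ? 3 : 1; // fcmxx + fcmyy + orr
  return IsOneOrUeq ? 3 : 1;                   // fcmxx + fcmyy + orr
}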
4767
4769AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4771 if (ST->requiresStrictAlign()) {
4772 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4773 // a bunch of instructions when strict align is enabled.
4774 return Options;
4775 }
4776 Options.AllowOverlappingLoads = true;
4777 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4778 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4779 // TODO: Though vector loads usually perform well on AArch64, on some targets
4780 // they may wake up the FP unit, which raises the power consumption. Perhaps
4781 // they could be used with no holds barred (-O3).
4782 Options.LoadSizes = {8, 4, 2, 1};
4783 Options.AllowedTailExpansions = {3, 5, 6};
4784 return Options;
4785}
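
// An illustrative sketch (not taken from this file, and not the algorithm
// MemCmpExpansion itself uses) of what the options above allow: with
// overlapping loads permitted and load sizes of {8, 4, 2, 1} bytes, a
// memcmp of N bytes can be covered by roughly ceil(N / Size) loads per
// operand of the largest size that fits, with the final load simply
// overlapping the previous one.
static unsigned approxMemCmpLoadsPerOperand(unsigned NumBytes) {
  static const unsigned LoadSizes[] = {8, 4, 2, 1};
  for (unsigned Size : LoadSizes)
    if (NumBytes >= Size)
      return (NumBytes + Size - 1) / Size; // last load may overlap
  return 0; // a zero-length memcmp needs no loads
}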
4786
4788 return ST->hasSVE();
4789}
4790
4794 switch (MICA.getID()) {
4795 case Intrinsic::masked_scatter:
4796 case Intrinsic::masked_gather:
4797 return getGatherScatterOpCost(MICA, CostKind);
4798 case Intrinsic::masked_load:
4799 case Intrinsic::masked_store:
4800 return getMaskedMemoryOpCost(MICA, CostKind);
4801 }
4803}
4804
4808 Type *Src = MICA.getDataType();
4809
4810 if (useNeonVector(Src))
4812 auto LT = getTypeLegalizationCost(Src);
4813 if (!LT.first.isValid())
4815
4816 // Return an invalid cost for element types that we are unable to lower.
4817 auto *VT = cast<VectorType>(Src);
4818 if (VT->getElementType()->isIntegerTy(1))
4820
4821 // The code-generator is currently not able to handle scalable vectors
4822 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4823 // it. This change will be removed when code-generation for these types is
4824 // sufficiently reliable.
4825 if (VT->getElementCount() == ElementCount::getScalable(1))
4827
4828 return LT.first;
4829}
4830
4831// This function returns gather/scatter overhead either from
4832// user-provided value or specialized values per-target from \p ST.
4833static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4834 const AArch64Subtarget *ST) {
4835 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4836 "Should be called on only load or stores.");
4837 switch (Opcode) {
4838 case Instruction::Load:
4839 if (SVEGatherOverhead.getNumOccurrences() > 0)
4840 return SVEGatherOverhead;
4841 return ST->getGatherOverhead();
4842 break;
4843 case Instruction::Store:
4844 if (SVEScatterOverhead.getNumOccurrences() > 0)
4845 return SVEScatterOverhead;
4846 return ST->getScatterOverhead();
4847 break;
4848 default:
4849 llvm_unreachable("Shouldn't have reached here");
4850 }
4851}
4852
4856
4857 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4858 MICA.getID() == Intrinsic::vp_gather)
4859 ? Instruction::Load
4860 : Instruction::Store;
4861
4862 Type *DataTy = MICA.getDataType();
4863 Align Alignment = MICA.getAlignment();
4864 const Instruction *I = MICA.getInst();
4865
4866 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4868 auto *VT = cast<VectorType>(DataTy);
4869 auto LT = getTypeLegalizationCost(DataTy);
4870 if (!LT.first.isValid())
4872
4873 // Return an invalid cost for element types that we are unable to lower.
4874 if (!LT.second.isVector() ||
4875 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4876 VT->getElementType()->isIntegerTy(1))
4878
4879 // The code-generator is currently not able to handle scalable vectors
4880 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4881 // it. This change will be removed when code-generation for these types is
4882 // sufficiently reliable.
4883 if (VT->getElementCount() == ElementCount::getScalable(1))
4885
4886 ElementCount LegalVF = LT.second.getVectorElementCount();
4887 InstructionCost MemOpCost =
4888 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4889 {TTI::OK_AnyValue, TTI::OP_None}, I);
4890 // Add on an overhead cost for using gathers/scatters.
4891 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4892 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4893}
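
// An illustrative sketch (not taken from this file) of the shape of the
// gather/scatter cost returned above: the cost of a single element-sized
// memory op is scaled by the per-subtarget gather/scatter overhead, by the
// (maximum) number of lanes and by the number of legalisation steps. All
// parameter names are illustrative.
static unsigned sveGatherScatterCostSketch(unsigned LegalizationSteps,
                                           unsigned PerElementMemOpCost,
                                           unsigned GatherScatterOverhead,
                                           unsigned MaxNumElements) {
  return LegalizationSteps * PerElementMemOpCost * GatherScatterOverhead *
         MaxNumElements;
}
// E.g. with the default overhead of 10, one legalisation step, a unit
// per-element cost and 4 lanes, a gather is modelled as costing 40.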
4894
4896 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4897}
4898
4900 Align Alignment,
4901 unsigned AddressSpace,
4903 TTI::OperandValueInfo OpInfo,
4904 const Instruction *I) const {
4905 EVT VT = TLI->getValueType(DL, Ty, true);
4906 // Type legalization can't handle structs
4907 if (VT == MVT::Other)
4908 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4909 CostKind);
4910
4911 auto LT = getTypeLegalizationCost(Ty);
4912 if (!LT.first.isValid())
4914
4915 // The code-generator is currently not able to handle scalable vectors
4916 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4917 // it. This change will be removed when code-generation for these types is
4918 // sufficiently reliable.
4919 // We also only support full register predicate loads and stores.
4920 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4921 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4922 (VTy->getElementType()->isIntegerTy(1) &&
4923 !VTy->getElementCount().isKnownMultipleOf(
4926
4927 // TODO: consider latency as well for TCK_SizeAndLatency.
4929 return LT.first;
4930
4932 return 1;
4933
4934 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4935 LT.second.is128BitVector() && Alignment < Align(16)) {
4936 // Unaligned stores are extremely inefficient. We don't split all
4937 // unaligned 128-bit stores because of the negative impact that has shown in
4938 // practice on inlined block copy code.
4939 // We make such stores expensive so that we will only vectorize if there
4940 // are 6 other instructions getting vectorized.
4941 const int AmortizationCost = 6;
4942
4943 return LT.first * 2 * AmortizationCost;
4944 }
4945
4946 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4947 if (Ty->isPtrOrPtrVectorTy())
4948 return LT.first;
4949
4950 if (useNeonVector(Ty)) {
4951 // Check truncating stores and extending loads.
4952 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4953 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4954 if (VT == MVT::v4i8)
4955 return 2;
4956 // Otherwise we need to scalarize.
4957 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4958 }
4959 EVT EltVT = VT.getVectorElementType();
4960 unsigned EltSize = EltVT.getScalarSizeInBits();
4961 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4962 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4963 return LT.first;
4964 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4965 // widening to v4i8, which produces suboptimal results.
4966 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4967 return LT.first;
4968
4969 // Check non-power-of-2 loads/stores for legal vector element types with
4970 // NEON. Non-power-of-2 memory ops will get broken down into a set of
4971 // smaller power-of-2 operations, including ld1/st1.
4972 LLVMContext &C = Ty->getContext();
4973 InstructionCost Cost = 0;
4974 SmallVector<EVT> TypeWorklist;
4975 TypeWorklist.push_back(VT);
4976 while (!TypeWorklist.empty()) {
4977 EVT CurrVT = TypeWorklist.pop_back_val();
4978 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4979 if (isPowerOf2_32(CurrNumElements)) {
4980 Cost += 1;
4981 continue;
4982 }
4983
4984 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4985 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4986 TypeWorklist.push_back(
4987 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4988 }
4989 return Cost;
4990 }
4991
4992 return LT.first;
4993}
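
// An illustrative sketch (not taken from this file) of the worklist in the
// NEON path above: a non-power-of-2 element count is recursively split into
// its largest power-of-2 prefix and the remainder, and each power-of-2
// piece costs one load/store. For example 7 lanes -> 4 + 3 -> 4 + 2 + 1,
// i.e. three memory operations.
static unsigned countPow2MemOpsSketch(unsigned NumElts) {
  if (NumElts == 0)
    return 0;
  if ((NumElts & (NumElts - 1)) == 0) // already a power of two
    return 1;
  unsigned PrevPow2 = 1;
  while (PrevPow2 * 2 < NumElts)
    PrevPow2 *= 2; // largest power of two strictly below NumElts
  return countPow2MemOpsSketch(PrevPow2) +
         countPow2MemOpsSketch(NumElts - PrevPow2);
}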
4994
4996 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4997 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4998 bool UseMaskForCond, bool UseMaskForGaps) const {
4999 assert(Factor >= 2 && "Invalid interleave factor");
5000 auto *VecVTy = cast<VectorType>(VecTy);
5001
5002 if (VecTy->isScalableTy() && !ST->hasSVE())
5004
5005 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5006 // only have lowering for power-of-2 factors.
5007 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5008 // InterleavedAccessPass for ld3/st3
5009 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5011
5012 // Vectorization for masked interleaved accesses is only enabled for scalable
5013 // VF.
5014 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5016
5017 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5018 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5019 auto *SubVecTy =
5020 VectorType::get(VecVTy->getElementType(),
5021 VecVTy->getElementCount().divideCoefficientBy(Factor));
5022
5023 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5024 // Accesses having vector types that are a multiple of 128 bits can be
5025 // matched to more than one ldN/stN instruction.
5026 bool UseScalable;
5027 if (MinElts % Factor == 0 &&
5028 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5029 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5030 }
5031
5032 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5033 Alignment, AddressSpace, CostKind,
5034 UseMaskForCond, UseMaskForGaps);
5035}
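
// An illustrative approximation (not taken from this file) of the legal
// ldN/stN cost returned above: Factor multiplied by the number of ldN/stN
// instructions needed, which is roughly the sub-vector size divided by the
// 128-bit register width. getNumInterleavedAccesses also handles other
// register sizes, so this helper is only an approximation.
static unsigned approxInterleavedAccessCost(unsigned Factor,
                                            unsigned SubVecSizeInBits) {
  unsigned NumAccesses = (SubVecSizeInBits + 127) / 128;
  if (NumAccesses == 0)
    NumAccesses = 1;
  return Factor * NumAccesses;
}
// E.g. an ld4 whose sub-vector fits in one 128-bit register is modelled as 4.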
5036
5041 for (auto *I : Tys) {
5042 if (!I->isVectorTy())
5043 continue;
5044 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5045 128)
5046 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5047 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5048 }
5049 return Cost;
5050}
5051
5053 return ST->getMaxInterleaveFactor();
5054}
5055
5056// For Falkor, we want to avoid having too many strided loads in a loop since
5057// that can exhaust the HW prefetcher resources. We adjust the unroller
5058// MaxCount preference below to attempt to ensure unrolling doesn't create too
5059// many strided loads.
5060static void
5063 enum { MaxStridedLoads = 7 };
5064 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5065 int StridedLoads = 0;
5066 // FIXME? We could make this more precise by looking at the CFG and
5067 // e.g. not counting loads in each side of an if-then-else diamond.
5068 for (const auto BB : L->blocks()) {
5069 for (auto &I : *BB) {
5070 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5071 if (!LMemI)
5072 continue;
5073
5074 Value *PtrValue = LMemI->getPointerOperand();
5075 if (L->isLoopInvariant(PtrValue))
5076 continue;
5077
5078 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5079 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5080 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5081 continue;
5082
5083 // FIXME? We could take pairing of unrolled load copies into account
5084 // by looking at the AddRec, but we would probably have to limit this
5085 // to loops with no stores or other memory optimization barriers.
5086 ++StridedLoads;
5087 // We've seen enough strided loads that seeing more won't make a
5088 // difference.
5089 if (StridedLoads > MaxStridedLoads / 2)
5090 return StridedLoads;
5091 }
5092 }
5093 return StridedLoads;
5094 };
5095
5096 int StridedLoads = countStridedLoads(L, SE);
5097 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5098 << " strided loads\n");
5099 // Pick the largest power of 2 unroll count that won't result in too many
5100 // strided loads.
5101 if (StridedLoads) {
5102 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5103 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5104 << UP.MaxCount << '\n');
5105 }
5106}
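
// An illustrative, standalone sketch (not taken from this file) of the
// Falkor clamp above, assuming the caller has already checked that
// StridedLoads is non-zero: the unroll count is the largest power of two
// that keeps (unroll count * strided loads) within the MaxStridedLoads = 7
// prefetcher budget, i.e. 1 << Log2_32(MaxStridedLoads / StridedLoads).
static unsigned falkorMaxUnrollSketch(unsigned StridedLoads) {
  const unsigned MaxStridedLoads = 7;
  unsigned Budget = MaxStridedLoads / StridedLoads;
  unsigned MaxCount = 1;
  while (MaxCount * 2 <= Budget)
    MaxCount *= 2;
  return MaxCount; // e.g. 1 load -> 4, 2 loads -> 2, 4+ loads -> 1
}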
5107
5108// This function returns true if the loop:
5109// 1. Has a valid cost, and
5110// 2. Has a cost within the supplied budget.
5111// Otherwise it returns false.
5113 InstructionCost Budget,
5114 unsigned *FinalSize) {
5115 // Estimate the size of the loop.
5116 InstructionCost LoopCost = 0;
5117
5118 for (auto *BB : L->getBlocks()) {
5119 for (auto &I : *BB) {
5120 SmallVector<const Value *, 4> Operands(I.operand_values());
5121 InstructionCost Cost =
5122 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5123 // This can happen with intrinsics that don't currently have a cost model
5124 // or for some operations that require SVE.
5125 if (!Cost.isValid())
5126 return false;
5127
5128 LoopCost += Cost;
5129 if (LoopCost > Budget)
5130 return false;
5131 }
5132 }
5133
5134 if (FinalSize)
5135 *FinalSize = LoopCost.getValue();
5136 return true;
5137}
5138
5140 const AArch64TTIImpl &TTI) {
5141 // Only consider loops with unknown trip counts for which we can determine
5142 // a symbolic expression. Multi-exit loops with small known trip counts will
5143 // likely be unrolled anyway.
5144 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5146 return false;
5147
5148 // It might not be worth unrolling loops with low max trip counts. Restrict
5149 // this to max trip counts > 32 for now.
5150 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5151 if (MaxTC > 0 && MaxTC <= 32)
5152 return false;
5153
5154 // Make sure the loop size is <= 5.
5155 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5156 return false;
5157
5158 // Small search loops with multiple exits can be highly beneficial to unroll.
5159 // We only care about loops with exactly two exiting blocks, although each
5160 // block could jump to the same exit block.
5161 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5162 if (Blocks.size() != 2)
5163 return false;
5164
5165 if (any_of(Blocks, [](BasicBlock *BB) {
5166 return !isa<BranchInst>(BB->getTerminator());
5167 }))
5168 return false;
5169
5170 return true;
5171}
5172
5173 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5174/// OOO engine's wide instruction window and various predictors.
5175static void
5178 const AArch64TTIImpl &TTI) {
5179 // Limit loops with structure that is highly likely to benefit from runtime
5180 // unrolling; that is, we exclude outer loops and loops with many blocks (i.e.
5181 // likely with complex control flow). Note that the heuristics here may be
5182 // overly conservative and we err on the side of avoiding runtime unrolling
5183 // rather than unrolling excessively. They are all subject to further refinement.
5184 if (!L->isInnermost() || L->getNumBlocks() > 8)
5185 return;
5186
5187 // Loops with multiple exits are handled by common code.
5188 if (!L->getExitBlock())
5189 return;
5190
5191 // Check if the loop contains any reductions that could be parallelized when
5192 // unrolling. If so, enable partial unrolling if the trip count is known to be
5193 // a multiple of 2.
5194 bool HasParellelizableReductions =
5195 L->getNumBlocks() == 1 &&
5196 any_of(L->getHeader()->phis(),
5197 [&SE, L](PHINode &Phi) {
5198 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5199 }) &&
5200 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5201 if (HasParellelizableReductions &&
5202 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5203 UP.Partial = true;
5204 UP.MaxCount = 4;
5205 UP.AddAdditionalAccumulators = true;
5206 }
5207
5208 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5210 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5211 SE.getSmallConstantMaxTripCount(L) <= 32))
5212 return;
5213
5214 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5215 return;
5216
5218 return;
5219
5220 // Limit to loops with trip counts that are cheap to expand.
5221 UP.SCEVExpansionBudget = 1;
5222
5223 if (HasParellelizableReductions) {
5224 UP.Runtime = true;
5226 UP.AddAdditionalAccumulators = true;
5227 }
5228
5229 // Try to unroll small loops (few blocks, low size budget) if they have
5230 // load/store dependencies, to expose more parallel memory access streams,
5231 // or if they do little work inside a block (i.e. a load -> X -> store pattern).
5232 BasicBlock *Header = L->getHeader();
5233 BasicBlock *Latch = L->getLoopLatch();
5234 if (Header == Latch) {
5235 // Estimate the size of the loop.
5236 unsigned Size;
5237 unsigned Width = 10;
5238 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5239 return;
5240
5241 // Try to find an unroll count that maximizes the use of the instruction
5242 // window, i.e. trying to fetch as many instructions per cycle as possible.
5243 unsigned MaxInstsPerLine = 16;
5244 unsigned UC = 1;
5245 unsigned BestUC = 1;
5246 unsigned SizeWithBestUC = BestUC * Size;
5247 while (UC <= 8) {
5248 unsigned SizeWithUC = UC * Size;
5249 if (SizeWithUC > 48)
5250 break;
5251 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5252 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5253 BestUC = UC;
5254 SizeWithBestUC = BestUC * Size;
5255 }
5256 UC++;
5257 }
5258
5259 if (BestUC == 1)
5260 return;
5261
5262 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5264 for (auto *BB : L->blocks()) {
5265 for (auto &I : *BB) {
5267 if (!Ptr)
5268 continue;
5269 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5270 if (SE.isLoopInvariant(PtrSCEV, L))
5271 continue;
5272 if (isa<LoadInst>(&I)) {
5273 LoadedValuesPlus.insert(&I);
5274 // Include in-loop 1st users of loaded values.
5275 for (auto *U : I.users())
5276 if (L->contains(cast<Instruction>(U)))
5277 LoadedValuesPlus.insert(U);
5278 } else
5279 Stores.push_back(cast<StoreInst>(&I));
5280 }
5281 }
5282
5283 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5284 return LoadedValuesPlus.contains(SI->getOperand(0));
5285 }))
5286 return;
5287
5288 UP.Runtime = true;
5289 UP.DefaultUnrollRuntimeCount = BestUC;
5290 return;
5291 }
5292
5293 // Try to runtime-unroll loops with early-continues depending on loop-varying
5294 // loads; this helps with branch-prediction for the early-continues.
5295 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5297 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5298 !llvm::is_contained(Preds, Header) ||
5299 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5300 return;
5301
5302 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5303 [&](Instruction *I, unsigned Depth) -> bool {
5304 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5305 return false;
5306
5307 if (isa<LoadInst>(I))
5308 return true;
5309
5310 return any_of(I->operands(), [&](Value *V) {
5311 auto *I = dyn_cast<Instruction>(V);
5312 return I && DependsOnLoopLoad(I, Depth + 1);
5313 });
5314 };
5315 CmpPredicate Pred;
5316 Instruction *I;
5317 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5318 m_Value())) &&
5319 DependsOnLoopLoad(I, 0)) {
5320 UP.Runtime = true;
5321 }
5322}
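
// An illustrative, standalone sketch (not taken from this file) of the
// unroll-count search in the single-block path above: among unroll counts
// up to 8 whose total size stays within 48 instructions, keep the count
// whose unrolled size lines up best with 16-instruction fetch groups
// (an exact multiple, or otherwise the largest remainder modulo 16).
static unsigned appleBestUnrollCountSketch(unsigned LoopSize) {
  const unsigned MaxInstsPerLine = 16;
  unsigned BestUC = 1, SizeWithBestUC = LoopSize;
  for (unsigned UC = 1; UC <= 8; ++UC) {
    unsigned SizeWithUC = UC * LoopSize;
    if (SizeWithUC > 48)
      break;
    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
      BestUC = UC;
      SizeWithBestUC = SizeWithUC;
    }
  }
  return BestUC; // e.g. a 6-instruction body picks 8 (48 == 3 * 16)
}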
5323
5326 OptimizationRemarkEmitter *ORE) const {
5327 // Enable partial unrolling and runtime unrolling.
5328 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5329
5330 UP.UpperBound = true;
5331
5332 // An inner loop is more likely to be hot, and its runtime check can be
5333 // promoted out by the LICM pass, so the overhead is lower; try a larger
5334 // threshold to unroll more loops.
5335 if (L->getLoopDepth() > 1)
5336 UP.PartialThreshold *= 2;
5337
5338 // Disable partial & runtime unrolling on -Os.
5340
5341 // Scan the loop: don't unroll loops with calls as this could prevent
5342 // inlining. Don't unroll auto-vectorized loops either, though do allow
5343 // unrolling of the scalar remainder.
5344 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5346 for (auto *BB : L->getBlocks()) {
5347 for (auto &I : *BB) {
5348 // Both auto-vectorized loops and the scalar remainder have the
5349 // isvectorized attribute, so differentiate between them by the presence
5350 // of vector instructions.
5351 if (IsVectorized && I.getType()->isVectorTy())
5352 return;
5353 if (isa<CallBase>(I)) {
5356 if (!isLoweredToCall(F))
5357 continue;
5358 return;
5359 }
5360
5361 SmallVector<const Value *, 4> Operands(I.operand_values());
5362 Cost += getInstructionCost(&I, Operands,
5364 }
5365 }
5366
5367 // Apply subtarget-specific unrolling preferences.
5368 if (ST->isAppleMLike())
5369 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5370 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5373
5374 // If this is a small, multi-exit loop similar to something like std::find,
5375 // then there is typically a performance improvement achieved by unrolling.
5376 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5377 UP.RuntimeUnrollMultiExit = true;
5378 UP.Runtime = true;
5379 // Limit unroll count.
5381 // Allow slightly more costly trip-count expansion to catch search loops
5382 // with pointer inductions.
5383 UP.SCEVExpansionBudget = 5;
5384 return;
5385 }
5386
5387 // Enable runtime unrolling for in-order models.
5388 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5389 // by checking for that case we can ensure that the default behaviour is
5390 // unchanged.
5391 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5392 !ST->getSchedModel().isOutOfOrder()) {
5393 UP.Runtime = true;
5394 UP.Partial = true;
5395 UP.UnrollRemainder = true;
5397
5398 UP.UnrollAndJam = true;
5400 }
5401
5402 // Forcing the unrolling of small loops can be very useful because of the
5403 // branch-taken cost of the backedge.
5405 UP.Force = true;
5406}
5407
5412
5414 Type *ExpectedType,
5415 bool CanCreate) const {
5416 switch (Inst->getIntrinsicID()) {
5417 default:
5418 return nullptr;
5419 case Intrinsic::aarch64_neon_st2:
5420 case Intrinsic::aarch64_neon_st3:
5421 case Intrinsic::aarch64_neon_st4: {
5422 // Create a struct type
5423 StructType *ST = dyn_cast<StructType>(ExpectedType);
5424 if (!CanCreate || !ST)
5425 return nullptr;
5426 unsigned NumElts = Inst->arg_size() - 1;
5427 if (ST->getNumElements() != NumElts)
5428 return nullptr;
5429 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5430 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5431 return nullptr;
5432 }
5433 Value *Res = PoisonValue::get(ExpectedType);
5434 IRBuilder<> Builder(Inst);
5435 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5436 Value *L = Inst->getArgOperand(i);
5437 Res = Builder.CreateInsertValue(Res, L, i);
5438 }
5439 return Res;
5440 }
5441 case Intrinsic::aarch64_neon_ld2:
5442 case Intrinsic::aarch64_neon_ld3:
5443 case Intrinsic::aarch64_neon_ld4:
5444 if (Inst->getType() == ExpectedType)
5445 return Inst;
5446 return nullptr;
5447 }
5448}
5449
5451 MemIntrinsicInfo &Info) const {
5452 switch (Inst->getIntrinsicID()) {
5453 default:
5454 break;
5455 case Intrinsic::aarch64_neon_ld2:
5456 case Intrinsic::aarch64_neon_ld3:
5457 case Intrinsic::aarch64_neon_ld4:
5458 Info.ReadMem = true;
5459 Info.WriteMem = false;
5460 Info.PtrVal = Inst->getArgOperand(0);
5461 break;
5462 case Intrinsic::aarch64_neon_st2:
5463 case Intrinsic::aarch64_neon_st3:
5464 case Intrinsic::aarch64_neon_st4:
5465 Info.ReadMem = false;
5466 Info.WriteMem = true;
5467 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5468 break;
5469 }
5470
5471 switch (Inst->getIntrinsicID()) {
5472 default:
5473 return false;
5474 case Intrinsic::aarch64_neon_ld2:
5475 case Intrinsic::aarch64_neon_st2:
5476 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5477 break;
5478 case Intrinsic::aarch64_neon_ld3:
5479 case Intrinsic::aarch64_neon_st3:
5480 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5481 break;
5482 case Intrinsic::aarch64_neon_ld4:
5483 case Intrinsic::aarch64_neon_st4:
5484 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5485 break;
5486 }
5487 return true;
5488}
5489
5490/// See if \p I should be considered for address type promotion. We check if \p
5491 /// I is a sext with the right type that is used in memory accesses. If it is
5492 /// used in a "complex" getelementptr, we allow it to be promoted without
5493 /// finding other sext instructions that sign extended the same initial value.
5494 /// A getelementptr is considered "complex" if it has more than 2 operands.
5496 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5497 bool Considerable = false;
5498 AllowPromotionWithoutCommonHeader = false;
5499 if (!isa<SExtInst>(&I))
5500 return false;
5501 Type *ConsideredSExtType =
5502 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5503 if (I.getType() != ConsideredSExtType)
5504 return false;
5505 // See if the sext is the one with the right type and used in at least one
5506 // GetElementPtrInst.
5507 for (const User *U : I.users()) {
5508 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5509 Considerable = true;
5510 // A getelementptr is considered as "complex" if it has more than 2
5511 // operands. We will promote a SExt used in such complex GEP as we
5512 // expect some computation to be merged if they are done on 64 bits.
5513 if (GEPInst->getNumOperands() > 2) {
5514 AllowPromotionWithoutCommonHeader = true;
5515 break;
5516 }
5517 }
5518 }
5519 return Considerable;
5520}
5521
5523 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5524 if (!VF.isScalable())
5525 return true;
5526
5527 Type *Ty = RdxDesc.getRecurrenceType();
5528 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5529 return false;
5530
5531 switch (RdxDesc.getRecurrenceKind()) {
5532 case RecurKind::Sub:
5534 case RecurKind::Add:
5535 case RecurKind::FAdd:
5536 case RecurKind::And:
5537 case RecurKind::Or:
5538 case RecurKind::Xor:
5539 case RecurKind::SMin:
5540 case RecurKind::SMax:
5541 case RecurKind::UMin:
5542 case RecurKind::UMax:
5543 case RecurKind::FMin:
5544 case RecurKind::FMax:
5545 case RecurKind::FMulAdd:
5546 case RecurKind::AnyOf:
5547 return true;
5548 default:
5549 return false;
5550 }
5551}
5552
5555 FastMathFlags FMF,
5557 // The code-generator is currently not able to handle scalable vectors
5558 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5559 // it. This change will be removed when code-generation for these types is
5560 // sufficiently reliable.
5561 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5562 if (VTy->getElementCount() == ElementCount::getScalable(1))
5564
5565 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5566
5567 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5568 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5569
5570 InstructionCost LegalizationCost = 0;
5571 if (LT.first > 1) {
5572 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5573 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5574 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5575 }
5576
5577 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5578}
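
// An illustrative sketch (not taken from this file) of the split-type
// handling above: when the type legalises into LT.first parts, the parts
// are first combined with LT.first - 1 pairwise min/max intrinsic calls,
// followed by the final 2-cost horizontal reduction of the legal part.
static unsigned minMaxReductionCostSketch(unsigned LegalizationSteps,
                                          unsigned PairwiseMinMaxCost) {
  return (LegalizationSteps - 1) * PairwiseMinMaxCost + 2;
}
// E.g. a type split into two legal halves with a 1-cost pairwise min costs 3.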
5579
5581 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5582 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5583 InstructionCost LegalizationCost = 0;
5584 if (LT.first > 1) {
5585 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5586 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5587 LegalizationCost *= LT.first - 1;
5588 }
5589
5590 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5591 assert(ISD && "Invalid opcode");
5592 // Add the final reduction cost for the legal horizontal reduction
5593 switch (ISD) {
5594 case ISD::ADD:
5595 case ISD::AND:
5596 case ISD::OR:
5597 case ISD::XOR:
5598 case ISD::FADD:
5599 return LegalizationCost + 2;
5600 default:
5602 }
5603}
5604
5607 std::optional<FastMathFlags> FMF,
5609 // The code-generator is currently not able to handle scalable vectors
5610 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5611 // it. This change will be removed when code-generation for these types is
5612 // sufficiently reliable.
5613 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5614 if (VTy->getElementCount() == ElementCount::getScalable(1))
5616
5618 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5619 InstructionCost BaseCost =
5620 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5621 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5622 // end up vectorizing for more computationally intensive loops.
5623 return BaseCost + FixedVTy->getNumElements();
5624 }
5625
5626 if (Opcode != Instruction::FAdd)
5628
5629 auto *VTy = cast<ScalableVectorType>(ValTy);
5631 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5632 Cost *= getMaxNumElements(VTy->getElementCount());
5633 return Cost;
5634 }
5635
5636 if (isa<ScalableVectorType>(ValTy))
5637 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5638
5639 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5640 MVT MTy = LT.second;
5641 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5642 assert(ISD && "Invalid opcode");
5643
5644 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5645 // instructions as twice a normal vector add, plus 1 for each legalization
5646 // step (LT.first). This is the only arithmetic vector reduction operation for
5647 // which we have an instruction.
5648 // OR, XOR and AND costs should match the codegen from:
5649 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5650 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5651 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5652 static const CostTblEntry CostTblNoPairwise[]{
5653 {ISD::ADD, MVT::v8i8, 2},
5654 {ISD::ADD, MVT::v16i8, 2},
5655 {ISD::ADD, MVT::v4i16, 2},
5656 {ISD::ADD, MVT::v8i16, 2},
5657 {ISD::ADD, MVT::v2i32, 2},
5658 {ISD::ADD, MVT::v4i32, 2},
5659 {ISD::ADD, MVT::v2i64, 2},
5660 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5661 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5662 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5663 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5664 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5665 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5666 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5667 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5668 {ISD::XOR, MVT::v16i8, 7},
5669 {ISD::XOR, MVT::v4i16, 4},
5670 {ISD::XOR, MVT::v8i16, 6},
5671 {ISD::XOR, MVT::v2i32, 3},
5672 {ISD::XOR, MVT::v4i32, 5},
5673 {ISD::XOR, MVT::v2i64, 3},
5674 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5675 {ISD::AND, MVT::v16i8, 7},
5676 {ISD::AND, MVT::v4i16, 4},
5677 {ISD::AND, MVT::v8i16, 6},
5678 {ISD::AND, MVT::v2i32, 3},
5679 {ISD::AND, MVT::v4i32, 5},
5680 {ISD::AND, MVT::v2i64, 3},
5681 };
5682 switch (ISD) {
5683 default:
5684 break;
5685 case ISD::FADD:
5686 if (Type *EltTy = ValTy->getScalarType();
5687 // FIXME: For half types without fullfp16 support, this could extend and
5688 // use a fp32 faddp reduction but current codegen unrolls.
5689 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5690 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5691 const unsigned NElts = MTy.getVectorNumElements();
5692 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5693 isPowerOf2_32(NElts))
5694 // Reduction corresponding to series of fadd instructions is lowered to
5695 // series of faddp instructions. faddp has latency/throughput that
5696 // matches fadd instruction and hence, every faddp instruction can be
5697 // considered to have a relative cost = 1 with
5698 // CostKind = TCK_RecipThroughput.
5699 // An faddp will pairwise add vector elements, so the size of input
5700 // vector reduces by half every time, requiring
5701 // #(faddp instructions) = log2_32(NElts).
5702 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5703 }
5704 break;
5705 case ISD::ADD:
5706 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5707 return (LT.first - 1) + Entry->Cost;
5708 break;
5709 case ISD::XOR:
5710 case ISD::AND:
5711 case ISD::OR:
5712 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5713 if (!Entry)
5714 break;
5715 auto *ValVTy = cast<FixedVectorType>(ValTy);
5716 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5717 isPowerOf2_32(ValVTy->getNumElements())) {
5718 InstructionCost ExtraCost = 0;
5719 if (LT.first != 1) {
5720 // Type needs to be split, so there is an extra cost of LT.first - 1
5721 // arithmetic ops.
5722 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5723 MTy.getVectorNumElements());
5724 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5725 ExtraCost *= LT.first - 1;
5726 }
5727 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5728 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5729 return Cost + ExtraCost;
5730 }
5731 break;
5732 }
5733 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5734}
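
// An illustrative sketch (not taken from this file) of the FADD reduction
// cost above: a power-of-2 fadd reduction lowers to a chain of faddp
// instructions that halves the vector on every step, so it needs
// Log2_32(NElts) faddp's plus one extra operation per additional
// legalisation step.
static unsigned faddpReductionCostSketch(unsigned LegalizationSteps,
                                         unsigned NElts /* power of 2 */) {
  unsigned Steps = 0;
  while ((1u << (Steps + 1)) <= NElts)
    ++Steps; // Log2_32(NElts)
  return (LegalizationSteps - 1) + Steps;
}
// E.g. a legal v4f32 fadd reduction (LT.first == 1) costs 2 (two faddp's).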
5735
5737 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5738 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5739 EVT VecVT = TLI->getValueType(DL, VecTy);
5740 EVT ResVT = TLI->getValueType(DL, ResTy);
5741
5742 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5743 VecVT.getSizeInBits() >= 64) {
5744 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5745
5746 // The legal cases are:
5747 // UADDLV 8/16/32->32
5748 // UADDLP 32->64
5749 unsigned RevVTSize = ResVT.getSizeInBits();
5750 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5751 RevVTSize <= 32) ||
5752 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5753 RevVTSize <= 32) ||
5754 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5755 RevVTSize <= 64))
5756 return (LT.first - 1) * 2 + 2;
5757 }
5758
5759 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5760 CostKind);
5761}
5762
5764AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5765 Type *ResTy, VectorType *VecTy,
5767 EVT VecVT = TLI->getValueType(DL, VecTy);
5768 EVT ResVT = TLI->getValueType(DL, ResTy);
5769
5770 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5771 RedOpcode == Instruction::Add) {
5772 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5773
5774 // The legal cases with dotprod are
5775 // UDOT 8->32
5776 // Which requires an additional uaddv to sum the i32 values.
5777 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5778 ResVT == MVT::i32)
5779 return LT.first + 2;
5780 }
5781
5782 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5783 CostKind);
5784}
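
// An illustrative sketch (not taken from this file) of the dot-product case
// above: with +dotprod, an i8 -> i32 multiply-accumulate reduction maps to
// one UDOT/SDOT per legalised v8i8/v16i8 part, plus a final UADDV-style sum
// of the i32 accumulator lanes, hence the LT.first + 2 returned above.
static unsigned dotProdMulAccCostSketch(unsigned LegalizationSteps) {
  const unsigned FinalSumCost = 2; // uaddv of the i32 accumulator lanes
  return LegalizationSteps + FinalSumCost;
}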
5785
5789 static const CostTblEntry ShuffleTbl[] = {
5790 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5791 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5792 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5793 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5794 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5795 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5796 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5797 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5798 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5799 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5800 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5801 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5802 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5803 };
5804
5805 // The code-generator is currently not able to handle scalable vectors
5806 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5807 // it. This change will be removed when code-generation for these types is
5808 // sufficiently reliable.
5811
5812 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5813 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5814 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5815 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5816 : LT.second;
5817 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5818 InstructionCost LegalizationCost = 0;
5819 if (Index < 0) {
5820 LegalizationCost =
5821 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5823 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5825 }
5826
5827 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
5828 // The cost is computed on the promoted type.
5829 if (LT.second.getScalarType() == MVT::i1) {
5830 LegalizationCost +=
5831 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5833 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5835 }
5836 const auto *Entry =
5837 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5838 assert(Entry && "Illegal Type for Splice");
5839 LegalizationCost += Entry->Cost;
5840 return LegalizationCost * LT.first;
5841}
5842
5844 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5846 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5849
5851 return Invalid;
5852
5853 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5854 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5855 return Invalid;
5856
5857 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5858 OpAExtend == TTI::PR_None)
5859 return Invalid;
5860
5861 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5862 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5863 "Unexpected values for OpBExtend or InputTypeB");
5864
5865 // We only support multiply binary operations for now, and for muls we
5866 // require the types being extended to be the same.
5867 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
5868 return Invalid;
5869
5870 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5871 if (IsUSDot && !ST->hasMatMulInt8())
5872 return Invalid;
5873
5874 unsigned Ratio =
5875 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5876 if (VF.getKnownMinValue() <= Ratio)
5877 return Invalid;
5878
5879 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5880 VectorType *AccumVectorType =
5881 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5882 // We don't yet support all kinds of legalization.
5883 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5884 EVT::getEVT(AccumVectorType));
5885 switch (TC.first) {
5886 default:
5887 return Invalid;
5891 // The legalised type (e.g. after splitting) must be legal too.
5892 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5894 return Invalid;
5895 break;
5896 }
5897
5898 std::pair<InstructionCost, MVT> AccumLT =
5899 getTypeLegalizationCost(AccumVectorType);
5900 std::pair<InstructionCost, MVT> InputLT =
5901 getTypeLegalizationCost(InputVectorType);
5902
5903 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5904
5905 // Prefer using full types by costing half-full input types as more expensive.
5906 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5908 // FIXME: This can be removed after the cost of the extends are folded into
5909 // the dot-product expression in VPlan, after landing:
5910 // https://github.com/llvm/llvm-project/pull/147302
5911 Cost *= 2;
5912
5913 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5914 // i16 -> i64 is natively supported for udot/sdot
5915 if (AccumLT.second.getScalarType() == MVT::i64 &&
5916 InputLT.second.getScalarType() == MVT::i16)
5917 return Cost;
5918 // i8 -> i64 is supported with an extra level of extends
5919 if (AccumLT.second.getScalarType() == MVT::i64 &&
5920 InputLT.second.getScalarType() == MVT::i8)
5921 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5922 // because it requires two extra extends on the inputs. But if we'd change
5923 // that now, a regular reduction would be cheaper because the costs of
5924 // the extends in the IR are still counted. This can be fixed
5925 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5926 return Cost;
5927 }
5928
5929 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5930 if (ST->isSVEorStreamingSVEAvailable() ||
5931 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5932 ST->hasDotProd())) {
5933 if (AccumLT.second.getScalarType() == MVT::i32 &&
5934 InputLT.second.getScalarType() == MVT::i8)
5935 return Cost;
5936 }
5937
5938 // Add additional cost for the extends that would need to be inserted.
5939 return Cost + 2;
5940}
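
// An illustrative sketch (not taken from this file) of the shape check in
// the partial-reduction cost above: a partial reduction folds Ratio input
// lanes into one accumulator lane, where Ratio is the ratio of the element
// widths, so the VF must supply more than Ratio input lanes to be useful.
// All parameter names are illustrative.
static bool partialReductionShapeSketch(unsigned AccumBits, unsigned InputBits,
                                        unsigned KnownMinVF) {
  unsigned Ratio = AccumBits / InputBits; // e.g. 32 / 8 == 4 for udot
  return KnownMinVF > Ratio;              // otherwise the cost is Invalid
}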
5941
5944 VectorType *SrcTy, ArrayRef<int> Mask,
5945 TTI::TargetCostKind CostKind, int Index,
5947 const Instruction *CxtI) const {
5948 assert((Mask.empty() || DstTy->isScalableTy() ||
5949 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5950 "Expected the Mask to match the return size if given");
5951 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5952 "Expected the same scalar types");
5953 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5954
5955 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5956 // into smaller vectors and sum the cost of each shuffle.
5957 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5958 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5959 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5960 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5961 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5962 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5963 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5964 // cost than just the load.
5965 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5968 return std::max<InstructionCost>(1, LT.first / 4);
5969
5970 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5971 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5972 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5973 // cost than just the store.
5974 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5976 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5978 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5979 return LT.first;
5980
5981 unsigned TpNumElts = Mask.size();
5982 unsigned LTNumElts = LT.second.getVectorNumElements();
5983 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5984 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5985 LT.second.getVectorElementCount());
5987 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5988 PreviousCosts;
5989 for (unsigned N = 0; N < NumVecs; N++) {
5990 SmallVector<int> NMask;
5991 // Split the existing mask into chunks of size LTNumElts. Track the source
5992 // sub-vectors to ensure the result has at most 2 inputs.
5993 unsigned Source1 = -1U, Source2 = -1U;
5994 unsigned NumSources = 0;
5995 for (unsigned E = 0; E < LTNumElts; E++) {
5996 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5998 if (MaskElt < 0) {
6000 continue;
6001 }
6002
6003 // Calculate which source from the input this comes from and whether it
6004 // is new to us.
6005 unsigned Source = MaskElt / LTNumElts;
6006 if (NumSources == 0) {
6007 Source1 = Source;
6008 NumSources = 1;
6009 } else if (NumSources == 1 && Source != Source1) {
6010 Source2 = Source;
6011 NumSources = 2;
6012 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6013 NumSources++;
6014 }
6015
6016 // Add to the new mask. For the NumSources>2 case these are not correct,
6017 // but are only used for the modular lane number.
6018 if (Source == Source1)
6019 NMask.push_back(MaskElt % LTNumElts);
6020 else if (Source == Source2)
6021 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6022 else
6023 NMask.push_back(MaskElt % LTNumElts);
6024 }
6025 // Check if we have already generated this sub-shuffle, which means we
6026 // will have already generated the output. For example a <16 x i32> splat
6027 // will be the same sub-splat 4 times, which only needs to be generated
6028 // once and reused.
6029 auto Result =
6030 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6031 // Check if it was already in the map (already costed).
6032 if (!Result.second)
6033 continue;
6034 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6035 // getShuffleCost. If not then cost it using the worst case as the number
6036 // of element moves into a new vector.
6037 InstructionCost NCost =
6038 NumSources <= 2
6039 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6041 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6042 CxtI)
6043 : LTNumElts;
6044 Result.first->second = NCost;
6045 Cost += NCost;
6046 }
6047 return Cost;
6048 }
6049
6050 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6051 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6052 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6053 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6054 // This currently only handles low or high extracts to prevent SLP vectorizer
6055 // regressions.
6056 // Note that SVE's ext instruction is destructive, but it can be fused with
6057 // a movprfx to act like a constructive instruction.
6058 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6059 if (LT.second.getFixedSizeInBits() >= 128 &&
6060 cast<FixedVectorType>(SubTp)->getNumElements() ==
6061 LT.second.getVectorNumElements() / 2) {
6062 if (Index == 0)
6063 return 0;
6064 if (Index == (int)LT.second.getVectorNumElements() / 2)
6065 return 1;
6066 }
6068 }
6069 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6070 // the code to handle length-changing shuffles.
6071 if (Kind == TTI::SK_InsertSubvector) {
6072 LT = getTypeLegalizationCost(DstTy);
6073 SrcTy = DstTy;
6074 }
6075
6076 // Check for identity masks, which we can treat as free for both fixed and
6077 // scalable vector paths.
6078 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6079 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6080 all_of(enumerate(Mask), [](const auto &M) {
6081 return M.value() < 0 || M.value() == (int)M.index();
6082 }))
6083 return 0;
6084
6085 // Segmented shuffle matching.
6086 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6087 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6088 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6090
6092 unsigned Segments =
6094 unsigned SegmentElts = VTy->getNumElements() / Segments;
6095
6096 // dupq zd.t, zn.t[idx]
6097 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6098 ST->isSVEorStreamingSVEAvailable() &&
6099 isDUPQMask(Mask, Segments, SegmentElts))
6100 return LT.first;
6101
6102 // mov zd.q, vn
6103 if (ST->isSVEorStreamingSVEAvailable() &&
6104 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6105 return LT.first;
6106 }
6107
6108 // Check for broadcast loads, which are supported by the LD1R instruction.
6109 // In terms of code-size, the shuffle vector is free when a load + dup get
6110 // folded into a LD1R. That's what we check and return here. For performance
6111 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6112 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6113 // that we model the load + dup sequence slightly higher because LD1R is a
6114 // high latency instruction.
6115 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6116 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6117 if (IsLoad && LT.second.isVector() &&
6118 isLegalBroadcastLoad(SrcTy->getElementType(),
6119 LT.second.getVectorElementCount()))
6120 return 0;
6121 }
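// Minimal IR sketch of the pattern treated as free for code size here:
//   %v  = load float, ptr %p
//   %i0 = insertelement <4 x float> poison, float %v, i64 0
//   %bc = shufflevector <4 x float> %i0, <4 x float> poison, <4 x i32> zeroinitializer
// which the backend can fold into a single "ld1r { v0.4s }, [x0]".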
6122
6123 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6124 // from the perfect shuffle tables.
6125 if (Mask.size() == 4 &&
6126 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6127 (SrcTy->getScalarSizeInBits() == 16 ||
6128 SrcTy->getScalarSizeInBits() == 32) &&
6129 all_of(Mask, [](int E) { return E < 8; }))
6130 return getPerfectShuffleCost(Mask);
6131
6132 // Check for other shuffles that are not SK_ kinds but we have native
6133 // instructions for, for example ZIP and UZP.
6134 unsigned Unused;
6135 if (LT.second.isFixedLengthVector() &&
6136 LT.second.getVectorNumElements() == Mask.size() &&
6137 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6138 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6139 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6140 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6141 LT.second.getVectorNumElements(), 16) ||
6142 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6143 LT.second.getVectorNumElements(), 32) ||
6144 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6145 LT.second.getVectorNumElements(), 64) ||
6146 // Check for non-zero lane splats
6147 all_of(drop_begin(Mask),
6148 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6149 return 1;
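// Example masks caught here for a <4 x i32> input, each a single instruction:
//   <0,4,1,5> -> zip1, <2,6,3,7> -> zip2, <0,2,4,6> -> uzp1, <1,0,3,2> -> rev64,
// and any splat of a non-zero lane such as <3,3,3,3> -> dup.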
6150
6151 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6152 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6153 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6154 static const CostTblEntry ShuffleTbl[] = {
6155 // Broadcast shuffle kinds can be performed with 'dup'.
6156 {TTI::SK_Broadcast, MVT::v8i8, 1},
6157 {TTI::SK_Broadcast, MVT::v16i8, 1},
6158 {TTI::SK_Broadcast, MVT::v4i16, 1},
6159 {TTI::SK_Broadcast, MVT::v8i16, 1},
6160 {TTI::SK_Broadcast, MVT::v2i32, 1},
6161 {TTI::SK_Broadcast, MVT::v4i32, 1},
6162 {TTI::SK_Broadcast, MVT::v2i64, 1},
6163 {TTI::SK_Broadcast, MVT::v4f16, 1},
6164 {TTI::SK_Broadcast, MVT::v8f16, 1},
6165 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6166 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6167 {TTI::SK_Broadcast, MVT::v2f32, 1},
6168 {TTI::SK_Broadcast, MVT::v4f32, 1},
6169 {TTI::SK_Broadcast, MVT::v2f64, 1},
6170 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6171 // 'zip1/zip2' instructions.
6172 {TTI::SK_Transpose, MVT::v8i8, 1},
6173 {TTI::SK_Transpose, MVT::v16i8, 1},
6174 {TTI::SK_Transpose, MVT::v4i16, 1},
6175 {TTI::SK_Transpose, MVT::v8i16, 1},
6176 {TTI::SK_Transpose, MVT::v2i32, 1},
6177 {TTI::SK_Transpose, MVT::v4i32, 1},
6178 {TTI::SK_Transpose, MVT::v2i64, 1},
6179 {TTI::SK_Transpose, MVT::v4f16, 1},
6180 {TTI::SK_Transpose, MVT::v8f16, 1},
6181 {TTI::SK_Transpose, MVT::v4bf16, 1},
6182 {TTI::SK_Transpose, MVT::v8bf16, 1},
6183 {TTI::SK_Transpose, MVT::v2f32, 1},
6184 {TTI::SK_Transpose, MVT::v4f32, 1},
6185 {TTI::SK_Transpose, MVT::v2f64, 1},
6186 // Select shuffle kinds.
6187 // TODO: handle vXi8/vXi16.
6188 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6189 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6190 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6191 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6192 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6193 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6194 // PermuteSingleSrc shuffle kinds.
6195 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6196 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6197 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6198 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6199 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6200 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6201 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6202 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6203 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6204 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6205 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6206 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6207 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6208 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6209 // Reverse can be lowered with `rev`.
6210 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6211 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6212 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6213 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6214 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6215 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6216 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6217 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6218 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6219 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6220 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6221 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6222 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6223 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6224 // Splice can all be lowered as `ext`.
6225 {TTI::SK_Splice, MVT::v2i32, 1},
6226 {TTI::SK_Splice, MVT::v4i32, 1},
6227 {TTI::SK_Splice, MVT::v2i64, 1},
6228 {TTI::SK_Splice, MVT::v2f32, 1},
6229 {TTI::SK_Splice, MVT::v4f32, 1},
6230 {TTI::SK_Splice, MVT::v2f64, 1},
6231 {TTI::SK_Splice, MVT::v8f16, 1},
6232 {TTI::SK_Splice, MVT::v8bf16, 1},
6233 {TTI::SK_Splice, MVT::v8i16, 1},
6234 {TTI::SK_Splice, MVT::v16i8, 1},
6235 {TTI::SK_Splice, MVT::v4f16, 1},
6236 {TTI::SK_Splice, MVT::v4bf16, 1},
6237 {TTI::SK_Splice, MVT::v4i16, 1},
6238 {TTI::SK_Splice, MVT::v8i8, 1},
6239 // Broadcast shuffle kinds for scalable vectors
6240 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6241 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6242 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6243 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6244 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6245 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6246 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6247 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6248 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6249 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6250 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6251 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6252 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6253 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6254 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6255 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6256 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6257 // Handle the cases for vector.reverse with scalable vectors
6258 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6259 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6260 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6261 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6262 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6263 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6264 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6265 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6266 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6267 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6268 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6269 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6270 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6271 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6272 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6273 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6274 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6275 };
6276 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6277 return LT.first * Entry->Cost;
6278 }
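// Note that LT.first scales the table cost by the number of legalized parts.
// For instance, a <8 x i32> broadcast legalizes to two v4i32 halves, so the
// returned cost is 2 * 1 rather than 1.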
6279
6280 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6281 return getSpliceCost(SrcTy, Index, CostKind);
6282
6283 // Inserting a subvector can often be done with either a D, S or H register
6284 // move, so long as the inserted vector is "aligned".
6285 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6286 LT.second.getSizeInBits() <= 128 && SubTp) {
6287 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6288 if (SubLT.second.isVector()) {
6289 int NumElts = LT.second.getVectorNumElements();
6290 int NumSubElts = SubLT.second.getVectorNumElements();
6291 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6292 return SubLT.first;
6293 }
6294 }
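// For example, inserting a <2 x float> subvector into a <4 x float> at index 0
// or 2 lines up with a 64-bit D-register lane move, so the cost is just that of
// the legalized subvector, i.e. SubLT.first.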
6295
6296 // Restore optimal kind.
6297 if (IsExtractSubvector)
6298 Kind = TTI::SK_ExtractSubvector;
6299 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6300 Args, CxtI);
6301}
6302
6303 static bool containsDecreasingPointers(Loop *TheLoop,
6304 PredicatedScalarEvolution *PSE,
6305 const DominatorTree &DT) {
6306 const auto &Strides = DenseMap<Value *, const SCEV *>();
6307 for (BasicBlock *BB : TheLoop->blocks()) {
6308 // Scan the instructions in the block and look for addresses that are
6309 // consecutive and decreasing.
6310 for (Instruction &I : *BB) {
6311 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6312 Value *Ptr = getLoadStorePointerOperand(&I);
6313 Type *AccessTy = getLoadStoreType(&I);
6314 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6315 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6316 .value_or(0) < 0)
6317 return true;
6318 }
6319 }
6320 }
6321 return false;
6322}
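// A sketch of a loop this flags: one that walks an array downwards, e.g.
//   for (unsigned i = N; i-- > 0;)
//     Sum += A[i];
// Here the load pointer strides by a negative amount each iteration, so
// getPtrStride reports a negative stride and the caller treats the loop as one
// that needs reversed predicates.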
6323
6324 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6325 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6326 return SVEPreferFixedOverScalableIfEqualCost;
6327 // For cases like post-LTO vectorization, where we eventually learn the trip
6328 // count, the fixed-width vectorized epilogue can be deleted if the trip
6329 // count turns out to be smaller than the epilogue's iteration count. That is
6330 // why we prefer fixed-width vectorization for the epilogue when costs are equal.
6331 if (IsEpilogue)
6332 return true;
6333 return ST->useFixedOverScalableIfEqualCost();
6334}
6335
6336 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6337 return ST->getEpilogueVectorizationMinVF();
6338}
6339
6340 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6341 if (!ST->hasSVE())
6342 return false;
6343
6344 // We don't currently support vectorisation with interleaving for SVE - with
6345 // such loops we're better off not using tail-folding. This gives us a chance
6346 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6347 if (TFI->IAI->hasGroups())
6348 return false;
6349
6350 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6351 if (TFI->LVL->getReductionVars().size())
6352 Required |= TailFoldingOpts::Reductions;
6353 if (TFI->LVL->getFixedOrderRecurrences().size())
6354 Required |= TailFoldingOpts::Recurrences;
6355
6356 // We call this to discover whether any load/store pointers in the loop have
6357 // negative strides. This will require extra work to reverse the loop
6358 // predicate, which may be expensive.
6359 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6360 TFI->LVL->getPredicatedScalarEvolution(),
6361 *TFI->LVL->getDominatorTree()))
6362 Required |= TailFoldingOpts::Reverse;
6363 if (Required == TailFoldingOpts::Disabled)
6364 Required |= TailFoldingOpts::Simple;
6365
6366 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6367 Required))
6368 return false;
6369
6370 // Don't tail-fold for tight loops where we would be better off interleaving
6371 // with an unpredicated loop.
6372 unsigned NumInsns = 0;
6373 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6374 NumInsns += BB->sizeWithoutDebug();
6375 }
6376
6377 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6378 return NumInsns >= SVETailFoldInsnThreshold;
6379}
6380
6381 InstructionCost
6382 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6383 StackOffset BaseOffset, bool HasBaseReg,
6384 int64_t Scale, unsigned AddrSpace) const {
6385 // Scaling factors are not free at all.
6386 // Operands | Rt Latency
6387 // -------------------------------------------
6388 // Rt, [Xn, Xm] | 4
6389 // -------------------------------------------
6390 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6391 // Rt, [Xn, Wm, <extend> #imm] |
6392 TargetLoweringBase::AddrMode AM;
6393 AM.BaseGV = BaseGV;
6394 AM.BaseOffs = BaseOffset.getFixed();
6395 AM.HasBaseReg = HasBaseReg;
6396 AM.Scale = Scale;
6397 AM.ScalableOffset = BaseOffset.getScalable();
6398 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6399 // Scale represents reg2 * scale, thus account for 1 if
6400 // it is not equal to 0 or 1.
6401 return AM.Scale != 0 && AM.Scale != 1;
6402 return InstructionCost::getInvalid();
6403}
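// Worked examples, roughly: "ldr w0, [x1, x2]" uses scale 1 and is treated as
// free (cost 0), while "ldr w0, [x1, x2, lsl #2]" uses scale 4 and costs 1
// extra, matching the latency table above.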
6404
6405 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6406 const Instruction *I) const {
6407 if (EnableOrLikeSelectOpt) {
6408 // For the binary operators (e.g. or) we need to be more careful than
6409 // selects, here we only transform them if they are already at a natural
6410 // break point in the code - the end of a block with an unconditional
6411 // terminator.
6412 if (I->getOpcode() == Instruction::Or &&
6413 isa<BranchInst>(I->getNextNode()) &&
6414 cast<BranchInst>(I->getNextNode())->isUnconditional())
6415 return true;
6416
6417 if (I->getOpcode() == Instruction::Add ||
6418 I->getOpcode() == Instruction::Sub)
6419 return true;
6420 }
6421 return BaseT::shouldTreatInstructionLikeSelect(I);
6422}
6423
6424 bool AArch64TTIImpl::isLSRCostLess(
6425 const TargetTransformInfo::LSRCost &C1,
6426 const TargetTransformInfo::LSRCost &C2) const {
6427 // AArch64 specific here is adding the number of instructions to the
6428 // comparison (though not as the first consideration, as some targets do)
6429 // along with changing the priority of the base additions.
6430 // TODO: Maybe a more nuanced tradeoff between instruction count
6431 // and number of registers? To be investigated at a later date.
6432 if (EnableLSRCostOpt)
6433 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6434 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6435 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6436 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6437
6438 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6439}
6440
6441static bool isSplatShuffle(Value *V) {
6442 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6443 return all_equal(Shuf->getShuffleMask());
6444 return false;
6445}
6446
6447/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6448/// or upper half of the vector elements.
6449static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6450 bool AllowSplat = false) {
6451 // Scalable types can't be extract shuffle vectors.
6452 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6453 return false;
6454
6455 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6456 auto *FullTy = FullV->getType();
6457 auto *HalfTy = HalfV->getType();
6458 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6459 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6460 };
6461
6462 auto extractHalf = [](Value *FullV, Value *HalfV) {
6463 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6464 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6465 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6466 };
6467
6468 ArrayRef<int> M1, M2;
6469 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6470 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6471 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6472 return false;
6473
6474 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6475 // it is not checked as an extract below.
6476 if (AllowSplat && isSplatShuffle(Op1))
6477 S1Op1 = nullptr;
6478 if (AllowSplat && isSplatShuffle(Op2))
6479 S2Op1 = nullptr;
6480
6481 // Check that the operands are half as wide as the result and we extract
6482 // half of the elements of the input vectors.
6483 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6484 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6485 return false;
6486
6487 // Check the mask extracts either the lower or upper half of vector
6488 // elements.
6489 int M1Start = 0;
6490 int M2Start = 0;
6491 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6492 if ((S1Op1 &&
6493 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6494 (S2Op1 &&
6495 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6496 return false;
6497
6498 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6499 (M2Start != 0 && M2Start != (NumElements / 2)))
6500 return false;
6501 if (S1Op1 && S2Op1 && M1Start != M2Start)
6502 return false;
6503
6504 return true;
6505}
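// IR sketch of the shape this recognizes (names are illustrative only):
//   %a = shufflevector <8 x i16> %x, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
//   %b = shufflevector <8 x i16> %y, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// Both operands extract the same (here: upper) half of their inputs, which is
// what lets a following sext/zext + add/sub pair fold into e.g. saddl2/ssubl2.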
6506
6507/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6508/// of the vector elements.
6509static bool areExtractExts(Value *Ext1, Value *Ext2) {
6510 auto areExtDoubled = [](Instruction *Ext) {
6511 return Ext->getType()->getScalarSizeInBits() ==
6512 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6513 };
6514
6515 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6516 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6517 !areExtDoubled(cast<Instruction>(Ext1)) ||
6518 !areExtDoubled(cast<Instruction>(Ext2)))
6519 return false;
6520
6521 return true;
6522}
6523
6524/// Check if Op could be used with vmull_high_p64 intrinsic.
6525 static bool isOperandOfVmullHighP64(Value *Op) {
6526 Value *VectorOperand = nullptr;
6527 ConstantInt *ElementIndex = nullptr;
6528 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6529 m_ConstantInt(ElementIndex))) &&
6530 ElementIndex->getValue() == 1 &&
6531 isa<FixedVectorType>(VectorOperand->getType()) &&
6532 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6533}
6534
6535/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6536 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6537 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6538}
6539
6541 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6542 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6543 if (!GEP || GEP->getNumOperands() != 2)
6544 return false;
6545
6546 Value *Base = GEP->getOperand(0);
6547 Value *Offsets = GEP->getOperand(1);
6548
6549 // We only care about scalar_base+vector_offsets.
6550 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6551 return false;
6552
6553 // Sink extends that would allow us to use 32-bit offset vectors.
6554 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6555 auto *OffsetsInst = cast<Instruction>(Offsets);
6556 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6557 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6558 Ops.push_back(&GEP->getOperandUse(1));
6559 }
6560
6561 // Sink the GEP.
6562 return true;
6563}
6564
6565/// We want to sink following cases:
6566/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6567/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6568 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6569 if (match(Op, m_VScale()))
6570 return true;
6571 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6572 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6573 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6574 return true;
6575 }
6576 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6577 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6578 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6579 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6580 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6581 return true;
6582 }
6583 return false;
6584}
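// IR sketch of the second pattern above (illustrative names):
//   %vs  = call i64 @llvm.vscale.i64()
//   %off = shl i64 %vs, 4
//   %gep = getelementptr i8, ptr %base, i64 %off
// Sinking %off (and the vscale feeding it) next to the gep lets instruction
// selection fold the whole offset into VL-scaled forms such as addvl.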
6585
6586/// Check if sinking \p I's operands to I's basic block is profitable, because
6587/// the operands can be folded into a target instruction, e.g.
6588/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6589 bool AArch64TTIImpl::isProfitableToSinkOperands(
6590 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6591 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
6592 switch (II->getIntrinsicID()) {
6593 case Intrinsic::aarch64_neon_smull:
6594 case Intrinsic::aarch64_neon_umull:
6595 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6596 /*AllowSplat=*/true)) {
6597 Ops.push_back(&II->getOperandUse(0));
6598 Ops.push_back(&II->getOperandUse(1));
6599 return true;
6600 }
6601 [[fallthrough]];
6602
6603 case Intrinsic::fma:
6604 case Intrinsic::fmuladd:
6605 if (isa<VectorType>(I->getType()) &&
6606 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6607 !ST->hasFullFP16())
6608 return false;
6609 [[fallthrough]];
6610 case Intrinsic::aarch64_neon_sqdmull:
6611 case Intrinsic::aarch64_neon_sqdmulh:
6612 case Intrinsic::aarch64_neon_sqrdmulh:
6613 // Sink splats for index lane variants
6614 if (isSplatShuffle(II->getOperand(0)))
6615 Ops.push_back(&II->getOperandUse(0));
6616 if (isSplatShuffle(II->getOperand(1)))
6617 Ops.push_back(&II->getOperandUse(1));
6618 return !Ops.empty();
6619 case Intrinsic::aarch64_neon_fmlal:
6620 case Intrinsic::aarch64_neon_fmlal2:
6621 case Intrinsic::aarch64_neon_fmlsl:
6622 case Intrinsic::aarch64_neon_fmlsl2:
6623 // Sink splats for index lane variants
6624 if (isSplatShuffle(II->getOperand(1)))
6625 Ops.push_back(&II->getOperandUse(1));
6626 if (isSplatShuffle(II->getOperand(2)))
6627 Ops.push_back(&II->getOperandUse(2));
6628 return !Ops.empty();
6629 case Intrinsic::aarch64_sve_ptest_first:
6630 case Intrinsic::aarch64_sve_ptest_last:
6631 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6632 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6633 Ops.push_back(&II->getOperandUse(0));
6634 return !Ops.empty();
6635 case Intrinsic::aarch64_sme_write_horiz:
6636 case Intrinsic::aarch64_sme_write_vert:
6637 case Intrinsic::aarch64_sme_writeq_horiz:
6638 case Intrinsic::aarch64_sme_writeq_vert: {
6639 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6640 if (!Idx || Idx->getOpcode() != Instruction::Add)
6641 return false;
6642 Ops.push_back(&II->getOperandUse(1));
6643 return true;
6644 }
6645 case Intrinsic::aarch64_sme_read_horiz:
6646 case Intrinsic::aarch64_sme_read_vert:
6647 case Intrinsic::aarch64_sme_readq_horiz:
6648 case Intrinsic::aarch64_sme_readq_vert:
6649 case Intrinsic::aarch64_sme_ld1b_vert:
6650 case Intrinsic::aarch64_sme_ld1h_vert:
6651 case Intrinsic::aarch64_sme_ld1w_vert:
6652 case Intrinsic::aarch64_sme_ld1d_vert:
6653 case Intrinsic::aarch64_sme_ld1q_vert:
6654 case Intrinsic::aarch64_sme_st1b_vert:
6655 case Intrinsic::aarch64_sme_st1h_vert:
6656 case Intrinsic::aarch64_sme_st1w_vert:
6657 case Intrinsic::aarch64_sme_st1d_vert:
6658 case Intrinsic::aarch64_sme_st1q_vert:
6659 case Intrinsic::aarch64_sme_ld1b_horiz:
6660 case Intrinsic::aarch64_sme_ld1h_horiz:
6661 case Intrinsic::aarch64_sme_ld1w_horiz:
6662 case Intrinsic::aarch64_sme_ld1d_horiz:
6663 case Intrinsic::aarch64_sme_ld1q_horiz:
6664 case Intrinsic::aarch64_sme_st1b_horiz:
6665 case Intrinsic::aarch64_sme_st1h_horiz:
6666 case Intrinsic::aarch64_sme_st1w_horiz:
6667 case Intrinsic::aarch64_sme_st1d_horiz:
6668 case Intrinsic::aarch64_sme_st1q_horiz: {
6669 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6670 if (!Idx || Idx->getOpcode() != Instruction::Add)
6671 return false;
6672 Ops.push_back(&II->getOperandUse(3));
6673 return true;
6674 }
6675 case Intrinsic::aarch64_neon_pmull:
6676 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6677 return false;
6678 Ops.push_back(&II->getOperandUse(0));
6679 Ops.push_back(&II->getOperandUse(1));
6680 return true;
6681 case Intrinsic::aarch64_neon_pmull64:
6682 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6683 II->getArgOperand(1)))
6684 return false;
6685 Ops.push_back(&II->getArgOperandUse(0));
6686 Ops.push_back(&II->getArgOperandUse(1));
6687 return true;
6688 case Intrinsic::masked_gather:
6689 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6690 return false;
6691 Ops.push_back(&II->getArgOperandUse(0));
6692 return true;
6693 case Intrinsic::masked_scatter:
6694 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6695 return false;
6696 Ops.push_back(&II->getArgOperandUse(1));
6697 return true;
6698 default:
6699 return false;
6700 }
6701 }
6702
6703 auto ShouldSinkCondition = [](Value *Cond,
6704 SmallVectorImpl<Use *> &Ops) -> bool {
6705 if (!isa<IntrinsicInst>(Cond))
6706 return false;
6707 auto *II = cast<IntrinsicInst>(Cond);
6708 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6709 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6710 return false;
6711 if (isa<CmpInst>(II->getOperand(0)))
6712 Ops.push_back(&II->getOperandUse(0));
6713 return true;
6714 };
6715
6716 switch (I->getOpcode()) {
6717 case Instruction::GetElementPtr:
6718 case Instruction::Add:
6719 case Instruction::Sub:
6720 // Sink vscales closer to uses for better isel
6721 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6722 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6723 Ops.push_back(&I->getOperandUse(Op));
6724 return true;
6725 }
6726 }
6727 break;
6728 case Instruction::Select: {
6729 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6730 return false;
6731
6732 Ops.push_back(&I->getOperandUse(0));
6733 return true;
6734 }
6735 case Instruction::Br: {
6736 if (cast<BranchInst>(I)->isUnconditional())
6737 return false;
6738
6739 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6740 return false;
6741
6742 Ops.push_back(&I->getOperandUse(0));
6743 return true;
6744 }
6745 default:
6746 break;
6747 }
6748
6749 if (!I->getType()->isVectorTy())
6750 return false;
6751
6752 switch (I->getOpcode()) {
6753 case Instruction::Sub:
6754 case Instruction::Add: {
6755 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6756 return false;
6757
6758 // If the exts' operands extract either the lower or upper elements, we
6759 // can sink them too.
6760 auto Ext1 = cast<Instruction>(I->getOperand(0));
6761 auto Ext2 = cast<Instruction>(I->getOperand(1));
6762 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6763 Ops.push_back(&Ext1->getOperandUse(0));
6764 Ops.push_back(&Ext2->getOperandUse(0));
6765 }
6766
6767 Ops.push_back(&I->getOperandUse(0));
6768 Ops.push_back(&I->getOperandUse(1));
6769
6770 return true;
6771 }
6772 case Instruction::Or: {
6773 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6774 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6775 if (ST->hasNEON()) {
6776 Instruction *OtherAnd, *IA, *IB;
6777 Value *MaskValue;
6778 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
6779 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6780 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6781 m_Instruction(IA)))))) {
6782 if (match(OtherAnd,
6783 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6784 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6785 ? cast<Instruction>(I->getOperand(1))
6786 : cast<Instruction>(I->getOperand(0));
6787
6788 // Both Ands should be in same basic block as Or
6789 if (I->getParent() != MainAnd->getParent() ||
6790 I->getParent() != OtherAnd->getParent())
6791 return false;
6792
6793 // Non-mask operands of both Ands should also be in same basic block
6794 if (I->getParent() != IA->getParent() ||
6795 I->getParent() != IB->getParent())
6796 return false;
6797
6798 Ops.push_back(
6799 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6800 Ops.push_back(&I->getOperandUse(0));
6801 Ops.push_back(&I->getOperandUse(1));
6802
6803 return true;
6804 }
6805 }
6806 }
6807
6808 return false;
6809 }
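// IR sketch of the bitselect shape handled above (illustrative values):
//   %notm = xor <16 x i8> %mask, splat (i8 -1)
//   %t0   = and <16 x i8> %mask, %a
//   %t1   = and <16 x i8> %notm, %b
//   %r    = or  <16 x i8> %t0, %t1
// With all of it in one block, the backend can select a single NEON bsl/bit/bif.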
6810 case Instruction::Mul: {
6811 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6812 auto *Ty = cast<VectorType>(V->getType());
6813 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6814 if (Ty->isScalableTy())
6815 return false;
6816
6817 // Indexed variants of Mul exist for i16 and i32 element types only.
6818 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6819 };
6820
6821 int NumZExts = 0, NumSExts = 0;
6822 for (auto &Op : I->operands()) {
6823 // Make sure we are not already sinking this operand
6824 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6825 continue;
6826
6827 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6828 auto *Ext = cast<Instruction>(Op);
6829 auto *ExtOp = Ext->getOperand(0);
6830 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6831 Ops.push_back(&Ext->getOperandUse(0));
6832 Ops.push_back(&Op);
6833
6834 if (isa<SExtInst>(Ext)) {
6835 NumSExts++;
6836 } else {
6837 NumZExts++;
6838 // A zext(a) also acts as a sext(zext(a)) when extending by more than 2x overall.
6839 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6840 I->getType()->getScalarSizeInBits())
6841 NumSExts++;
6842 }
6843
6844 continue;
6845 }
6846
6847 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6848 if (!Shuffle)
6849 continue;
6850
6851 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6852 // operand and the s/zext can help create indexed s/umull. This is
6853 // especially useful to prevent i64 mul being scalarized.
6854 if (isSplatShuffle(Shuffle) &&
6855 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6856 Ops.push_back(&Shuffle->getOperandUse(0));
6857 Ops.push_back(&Op);
6858 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6859 NumSExts++;
6860 else
6861 NumZExts++;
6862 continue;
6863 }
6864
6865 Value *ShuffleOperand = Shuffle->getOperand(0);
6866 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6867 if (!Insert)
6868 continue;
6869
6870 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6871 if (!OperandInstr)
6872 continue;
6873
6874 ConstantInt *ElementConstant =
6875 dyn_cast<ConstantInt>(Insert->getOperand(2));
6876 // Check that the insertelement is inserting into element 0
6877 if (!ElementConstant || !ElementConstant->isZero())
6878 continue;
6879
6880 unsigned Opcode = OperandInstr->getOpcode();
6881 if (Opcode == Instruction::SExt)
6882 NumSExts++;
6883 else if (Opcode == Instruction::ZExt)
6884 NumZExts++;
6885 else {
6886 // If we find that the top bits are known 0, then we can sink and allow
6887 // the backend to generate a umull.
6888 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6889 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6890 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6891 continue;
6892 NumZExts++;
6893 }
6894
6895 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6896 // the And, just to hoist it again back to the load.
6897 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6898 Ops.push_back(&Insert->getOperandUse(1));
6899 Ops.push_back(&Shuffle->getOperandUse(0));
6900 Ops.push_back(&Op);
6901 }
6902
6903 // It is profitable to sink if we found two of the same type of extends.
6904 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6905 return true;
6906
6907 // Otherwise, see if we should sink splats for indexed variants.
6908 if (!ShouldSinkSplatForIndexedVariant(I))
6909 return false;
6910
6911 Ops.clear();
6912 if (isSplatShuffle(I->getOperand(0)))
6913 Ops.push_back(&I->getOperandUse(0));
6914 if (isSplatShuffle(I->getOperand(1)))
6915 Ops.push_back(&I->getOperandUse(1));
6916
6917 return !Ops.empty();
6918 }
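// Sketch of why the extend-sinking above pays off (illustrative IR):
//   entry:                                  ; exts defined far from the mul
//     %sa = sext <4 x i16> %a to <4 x i32>
//     %sb = sext <4 x i16> %b to <4 x i32>
//   ...
//   loop:
//     %m = mul <4 x i32> %sa, %sb
// Sinking both sexts next to the mul lets isel emit a single
// "smull v0.4s, v1.4h, v2.4h" (or an indexed smull when one side is a splat).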
6919 case Instruction::FMul: {
6920 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6921 if (I->getType()->isScalableTy())
6922 return false;
6923
6924 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6925 !ST->hasFullFP16())
6926 return false;
6927
6928 // Sink splats for index lane variants
6929 if (isSplatShuffle(I->getOperand(0)))
6930 Ops.push_back(&I->getOperandUse(0));
6931 if (isSplatShuffle(I->getOperand(1)))
6932 Ops.push_back(&I->getOperandUse(1));
6933 return !Ops.empty();
6934 }
6935 default:
6936 return false;
6937 }
6938 return false;
6939}
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2497
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
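A minimal usage sketch of a few of the IRBuilder entry points listed above; the helper name and the choice of a 4-element splat are illustrative assumptions, not code from this file.

#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Splat a scalar into a 4-element vector and read lane 0 back out, inserting
// at the start of F's entry block.
static Value *splatAndExtractLane0(Function &F, Value *Scalar) {
  BasicBlock &Entry = F.getEntryBlock();
  IRBuilder<> Builder(&Entry, Entry.getFirstInsertionPt());
  Value *Splat = Builder.CreateVectorSplat(/*NumElts=*/4, Scalar, "splat");
  return Builder.CreateExtractElement(Splat, Builder.getInt64(0), "lane0");
}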
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
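A short sketch of how the ScalarEvolution trip-count queries above are typically consulted; the helper below is hypothetical, assuming a ScalarEvolution analysis and a Loop supplied by the caller.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Returns true if SCEV can compute an exact backedge-taken count, and also
// reports any known constant upper bound on the trip count (0 if unknown).
static bool hasComputableBackedgeCount(ScalarEvolution &SE, const Loop *L,
                                       unsigned &MaxTripCount) {
  MaxTripCount = SE.getSmallConstantMaxTripCount(L);
  return !isa<SCEVCouldNotCompute>(SE.getBackedgeTakenCount(L));
}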
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
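The shuffle-mask classifiers above can be exercised directly on a hand-written mask. The example below is an illustration only, not code from this file: it checks that <0, 2, 4, 6> is a factor-2 de-interleave mask starting at lane 0.

#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool evenLanesAreDeInterleave() {
  int Mask[] = {0, 2, 4, 6}; // pick every second lane of an 8-lane input
  unsigned Index = 0;
  return ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, /*Factor=*/2,
                                                       Index) &&
         Index == 0;
}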
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:702
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
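As a concrete illustration of the ElementCount helpers above (a sketch, not code from this file): an SVE <vscale x 4 x i32> vector has a scalable element count with known minimum 4, and splitting it halves that minimum while keeping the scalable flag.

#include "llvm/Support/TypeSize.h"
using namespace llvm;

static ElementCount halfOfScalableFour() {
  ElementCount EC = ElementCount::getScalable(4); // vscale x 4
  // Division is spelled divideCoefficientBy for polynomial quantities.
  return EC.divideCoefficientBy(2);               // vscale x 2
}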
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:981
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:966
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
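A minimal sketch of the PatternMatch combinators listed above; the helper is hypothetical and simply recognises mul(zext A, zext B), the shape of a widening multiply, capturing the narrow operands.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool matchWideningMul(Value *V, Value *&A, Value *&B) {
  // A and B are bound only when the whole pattern matches.
  return match(V, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))));
}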
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2484
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2120
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
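The cost-table helpers above (CostTableLookup, ConvertCostTableLookup) are queried against static (ISD opcode, MVT) tables. The sketch below uses made-up entries purely to show the lookup idiom, not the real AArch64 cost numbers.

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

static InstructionCost lookupExampleCost(int ISD, MVT Ty) {
  static const CostTblEntry ExampleTbl[] = {
      {ISD::ADD, MVT::v4i32, 1}, // illustrative numbers only
      {ISD::MUL, MVT::v2i64, 4},
  };
  if (const auto *Entry = CostTableLookup(ExampleTbl, ISD, Ty))
    return Entry->Cost;
  return InstructionCost::getInvalid();
}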
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
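The isPowerOf2 and NextPowerOf2 helpers listed above are constexpr, so their behaviour can be pinned down with static_asserts (the values below are illustrative only).

#include "llvm/Support/MathExtras.h"

// NextPowerOf2 returns the power of two strictly greater than its argument.
static_assert(llvm::NextPowerOf2(12) == 16, "12 rounds up to 16");
static_assert(llvm::NextPowerOf2(16) == 32, "strictly greater, so 16 -> 32");
static_assert(llvm::isPowerOf2_32(16) && !llvm::isPowerOf2_32(12),
              "isPowerOf2_32 accepts only exact powers of two > 0");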
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
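A small sketch (assuming an LLVMContext supplied by the caller) showing the EVT queries above: build a fixed-length v4f32 and inspect it.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static bool evtExample(LLVMContext &Ctx) {
  EVT EltVT = EVT::getEVT(Type::getFloatTy(Ctx));
  EVT VecVT = EVT::getVectorVT(Ctx, EltVT, /*NumElements=*/4);
  return VecVT.isFixedLengthVector() && VecVT.getVectorNumElements() == 4 &&
         VecVT.getScalarSizeInBits() == 32;
}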
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...