1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
40static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44 cl::Hidden);
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
52static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54 cl::Hidden);
55
56static cl::opt<unsigned> CallPenaltyChangeSM(
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
58 cl::desc(
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
61static cl::opt<unsigned> InlineCallPenaltyChangeSM(
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
72static cl::opt<unsigned>
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
76static cl::opt<unsigned> DMBLookaheadThreshold(
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
80namespace {
81class TailFoldingOption {
82 // These bitfields will only ever be set to something non-zero in operator=,
83 // when setting the -sve-tail-folding option. This option should always be of
84 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
85 // InitialBits is one of (disabled|all|simple). EnableBits represents
86 // additional flags we're enabling, and DisableBits for those flags we're
87 // disabling. The default flag is tracked in the variable NeedsDefault, since
88 // at the time of setting the option we may not know what the default value
89 // for the CPU is.
90 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
92 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
93
94 // This value needs to be initialised to true in case the user does not
95 // explicitly set the -sve-tail-folding option.
96 bool NeedsDefault = true;
97
98 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
99
100 void setNeedsDefault(bool V) { NeedsDefault = V; }
101
102 void setEnableBit(TailFoldingOpts Bit) {
103 EnableBits |= Bit;
104 DisableBits &= ~Bit;
105 }
106
107 void setDisableBit(TailFoldingOpts Bit) {
108 EnableBits &= ~Bit;
109 DisableBits |= Bit;
110 }
111
112 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
113 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
114
115 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
116 "Initial bits should only include one of "
117 "(disabled|all|simple|default)");
118 Bits = NeedsDefault ? DefaultBits : InitialBits;
119 Bits |= EnableBits;
120 Bits &= ~DisableBits;
121
122 return Bits;
123 }
124
125 void reportError(std::string Opt) {
126 errs() << "invalid argument '" << Opt
127 << "' to -sve-tail-folding=; the option should be of the form\n"
128 " (disabled|all|default|simple)[+(reductions|recurrences"
129 "|reverse|noreductions|norecurrences|noreverse)]\n";
130 report_fatal_error("Unrecognised tail-folding option");
131 }
132
133public:
134
135 void operator=(const std::string &Val) {
136 // If the user explicitly sets -sve-tail-folding= then treat as an error.
137 if (Val.empty()) {
138 reportError("");
139 return;
140 }
141
142 // Since the user is explicitly setting the option we don't automatically
143 // need the default unless they require it.
144 setNeedsDefault(false);
145
146 SmallVector<StringRef, 4> TailFoldTypes;
147 StringRef(Val).split(TailFoldTypes, '+', -1, false);
148
149 unsigned StartIdx = 1;
150 if (TailFoldTypes[0] == "disabled")
151 setInitialBits(TailFoldingOpts::Disabled);
152 else if (TailFoldTypes[0] == "all")
153 setInitialBits(TailFoldingOpts::All);
154 else if (TailFoldTypes[0] == "default")
155 setNeedsDefault(true);
156 else if (TailFoldTypes[0] == "simple")
157 setInitialBits(TailFoldingOpts::Simple);
158 else {
159 StartIdx = 0;
160 setInitialBits(TailFoldingOpts::Disabled);
161 }
162
163 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
164 if (TailFoldTypes[I] == "reductions")
165 setEnableBit(TailFoldingOpts::Reductions);
166 else if (TailFoldTypes[I] == "recurrences")
167 setEnableBit(TailFoldingOpts::Recurrences);
168 else if (TailFoldTypes[I] == "reverse")
169 setEnableBit(TailFoldingOpts::Reverse);
170 else if (TailFoldTypes[I] == "noreductions")
171 setDisableBit(TailFoldingOpts::Reductions);
172 else if (TailFoldTypes[I] == "norecurrences")
173 setDisableBit(TailFoldingOpts::Recurrences);
174 else if (TailFoldTypes[I] == "noreverse")
175 setDisableBit(TailFoldingOpts::Reverse);
176 else
177 reportError(Val);
178 }
179 }
180
181 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
182 return (getBits(DefaultBits) & Required) == Required;
183 }
184};
185} // namespace
186
187TailFoldingOption TailFoldingOptionLoc;
188
189cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
190 "sve-tail-folding",
191 cl::desc(
192 "Control the use of vectorisation using tail-folding for SVE where the"
193 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
194 "\ndisabled (Initial) No loop types will vectorize using "
195 "tail-folding"
196 "\ndefault (Initial) Uses the default tail-folding settings for "
197 "the target CPU"
198 "\nall (Initial) All legal loop types will vectorize using "
199 "tail-folding"
200 "\nsimple (Initial) Use tail-folding for simple loops (not "
201 "reductions or recurrences)"
202 "\nreductions Use tail-folding for loops containing reductions"
203 "\nnoreductions Inverse of above"
204 "\nrecurrences Use tail-folding for loops containing fixed order "
205 "recurrences"
206 "\nnorecurrences Inverse of above"
207 "\nreverse Use tail-folding for loops requiring reversed "
208 "predicates"
209 "\nnoreverse Inverse of above"),
210 cl::location(TailFoldingOptionLoc));
211
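// For example (illustrative sketch of the parsing above, assuming
// TailFoldingOpts::All sets every flag): "-sve-tail-folding=all+noreverse"
// behaves as
//
//   TailFoldingOption Opt;
//   Opt = std::string("all+noreverse");
//   Opt.satisfies(DefaultBits, TailFoldingOpts::Reductions); // true
//   Opt.satisfies(DefaultBits, TailFoldingOpts::Reverse);    // false
//
// for any DefaultBits, since an explicit initial word overrides the default,
// whereas "default+reductions" keeps the CPU defaults and additionally
// enables tail-folding of loops containing reductions.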
212// Experimental option that will only be fully functional when the
213// code-generator is changed to use SVE instead of NEON for all fixed-width
214// operations.
215static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
216 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
217
218// Experimental option that will only be fully functional when the cost-model
219// and code-generator have been changed to avoid using scalable vector
220// instructions that are not legal in streaming SVE mode.
221static cl::opt<bool> EnableScalableAutovecInStreamingMode(
222 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
223
224static bool isSMEABIRoutineCall(const CallInst &CI,
225 const AArch64TargetLowering &TLI) {
226 const auto *F = CI.getCalledFunction();
227 return F &&
229}
230
231/// Returns true if the function has explicit operations that can only be
232/// lowered using incompatible instructions for the selected mode. This also
233/// returns true if the function F may use or modify ZA state.
234static bool hasPossibleIncompatibleOps(const Function *F,
235 const AArch64TargetLowering &TLI) {
236 for (const BasicBlock &BB : *F) {
237 for (const Instruction &I : BB) {
238 // Be conservative for now and assume that any call to inline asm or to
239 // intrinsics could result in non-streaming ops (e.g. calls to
240 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
241 // all native LLVM instructions can be lowered to compatible instructions.
242 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
243 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
244 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
245 return true;
246 }
247 }
248 return false;
249}
250
252 StringRef AttributeStr =
253 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
254 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
255 SmallVector<StringRef, 8> Features;
256 FeatureStr.split(Features, ",");
257 return AArch64::getFMVPriority(Features);
258}
259
260bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
261 return F.hasFnAttribute("fmv-features");
262}
263
264const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
265 AArch64::FeatureExecuteOnly,
266};
267
268bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
269 const Function *Callee) const {
270 SMECallAttrs CallAttrs(*Caller, *Callee);
271
272 // Never inline a function explicitly marked as being streaming,
273 // into a non-streaming function. Assume it was marked as streaming
274 // for a reason.
275 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
276 CallAttrs.callee().hasStreamingInterface())
277 return false;
278
279 // When inlining, we should consider the body of the function, not the
280 // interface.
281 if (CallAttrs.callee().hasStreamingBody()) {
282 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
283 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
284 }
285
286 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
287 return false;
288
289 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
290 CallAttrs.requiresPreservingZT0() ||
291 CallAttrs.requiresPreservingAllZAState()) {
292 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
293 return false;
294 }
295
296 const TargetMachine &TM = getTLI()->getTargetMachine();
297 const FeatureBitset &CallerBits =
298 TM.getSubtargetImpl(*Caller)->getFeatureBits();
299 const FeatureBitset &CalleeBits =
300 TM.getSubtargetImpl(*Callee)->getFeatureBits();
301 // Adjust the feature bitsets by inverting some of the bits. This is needed
302 // for target features that represent restrictions rather than capabilities,
303 // for example a "+execute-only" callee can be inlined into a caller without
304 // "+execute-only", but not vice versa.
305 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
306 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
307
308 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
309}
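// Worked example of the inversion above: with InlineInverseFeatures
// containing only FeatureExecuteOnly, a caller built with {+neon} and a
// callee built with {+neon,+execute-only} become, after the XOR,
// {+neon,+execute-only} and {+neon} respectively, so the callee's bits are a
// subset of the caller's and inlining is permitted. Swapping the roles makes
// the subset check fail, as the comment above intends.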
310
311bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
312 const Function *Callee,
313 ArrayRef<Type *> Types) const {
314 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
315 return false;
316
317 // We need to ensure that argument promotion does not attempt to promote
318 // pointers to fixed-length vector types larger than 128 bits like
319 // <8 x float> (and pointers to aggregate types which have such fixed-length
320 // vector type members) into the values of the pointees. Such vector types
321 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
322 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
323 // types can be safely treated as 128-bit NEON types and they cannot be
324 // distinguished in IR.
325 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
326 auto FVTy = dyn_cast<FixedVectorType>(Ty);
327 return FVTy &&
328 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
329 }))
330 return false;
331
332 return true;
333}
334
335unsigned
336AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
337 unsigned DefaultCallPenalty) const {
338 // This function calculates a penalty for executing Call in F.
339 //
340 // There are two ways this function can be called:
341 // (1) F:
342 // call from F -> G (the call here is Call)
343 //
344 // For (1), Call.getCaller() == F, so it will always return a high cost if
345 // a streaming-mode change is required (thus promoting the need to inline the
346 // function)
347 //
348 // (2) F:
349 // call from F -> G (the call here is not Call)
350 // G:
351 // call from G -> H (the call here is Call)
352 //
353 // For (2), if after inlining the body of G into F the call to H requires a
354 // streaming-mode change, and the call to G from F would also require a
355 // streaming-mode change, then there is benefit to do the streaming-mode
356 // change only once and avoid inlining of G into F.
357
358 SMEAttrs FAttrs(*F);
359 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
360
361 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
362 if (F == Call.getCaller()) // (1)
363 return CallPenaltyChangeSM * DefaultCallPenalty;
364 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
365 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
366 }
367
368 return DefaultCallPenalty;
369}
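// Worked example: with the default thresholds above
// (call-penalty-sm-change=5, inline-call-penalty-sm-change=10) and
// DefaultCallPenalty=1, case (1) reports a penalty of 5, encouraging the
// inliner to remove the F -> G call, while case (2) reports 10, discouraging
// inlining of G into F so the streaming-mode change is paid only once.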
370
377
378/// Calculate the cost of materializing a 64-bit value. This helper
379/// method might only calculate a fraction of a larger immediate. Therefore it
380/// is valid to return a cost of ZERO.
381InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
382 // Check if the immediate can be encoded within an instruction.
383 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
384 return 0;
385
386 if (Val < 0)
387 Val = ~Val;
388
389 // Calculate how many moves we will need to materialize this constant.
390 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
391 AArch64_IMM::expandMOVImm(Val, 64, Insn);
392 return Insn.size();
393}
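// Illustrative costs produced by this helper (mirroring the number of
// MOVZ/MOVK instructions AArch64_IMM::expandMOVImm emits):
//   Val = 0              -> 0 (free)
//   Val = 0x1234         -> 1 (movz)
//   Val = 0xABCD00001234 -> 2 (movz + movk)
// Negative values are inverted first so that MOVN-based sequences are costed
// like their positive counterparts.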
394
395/// Calculate the cost of materializing the given constant.
396InstructionCost
397AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
398 TTI::TargetCostKind CostKind) const {
399 assert(Ty->isIntegerTy());
400
401 unsigned BitSize = Ty->getPrimitiveSizeInBits();
402 if (BitSize == 0)
403 return ~0U;
404
405 // Sign-extend all constants to a multiple of 64-bit.
406 APInt ImmVal = Imm;
407 if (BitSize & 0x3f)
408 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
409
410 // Split the constant into 64-bit chunks and calculate the cost for each
411 // chunk.
412 InstructionCost Cost = 0;
413 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
414 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
415 int64_t Val = Tmp.getSExtValue();
416 Cost += getIntImmCost(Val);
417 }
418 // We need at least one instruction to materialize the constant.
419 return std::max<InstructionCost>(1, Cost);
420}
421
422InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
423 const APInt &Imm, Type *Ty,
424 TTI::TargetCostKind CostKind,
425 Instruction *Inst) const {
426 assert(Ty->isIntegerTy());
427
428 unsigned BitSize = Ty->getPrimitiveSizeInBits();
429 // There is no cost model for constants with a bit size of 0. Return TCC_Free
430 // here, so that constant hoisting will ignore this constant.
431 if (BitSize == 0)
432 return TTI::TCC_Free;
433
434 unsigned ImmIdx = ~0U;
435 switch (Opcode) {
436 default:
437 return TTI::TCC_Free;
438 case Instruction::GetElementPtr:
439 // Always hoist the base address of a GetElementPtr.
440 if (Idx == 0)
441 return 2 * TTI::TCC_Basic;
442 return TTI::TCC_Free;
443 case Instruction::Store:
444 ImmIdx = 0;
445 break;
446 case Instruction::Add:
447 case Instruction::Sub:
448 case Instruction::Mul:
449 case Instruction::UDiv:
450 case Instruction::SDiv:
451 case Instruction::URem:
452 case Instruction::SRem:
453 case Instruction::And:
454 case Instruction::Or:
455 case Instruction::Xor:
456 case Instruction::ICmp:
457 ImmIdx = 1;
458 break;
459 // Always return TCC_Free for the shift value of a shift instruction.
460 case Instruction::Shl:
461 case Instruction::LShr:
462 case Instruction::AShr:
463 if (Idx == 1)
464 return TTI::TCC_Free;
465 break;
466 case Instruction::Trunc:
467 case Instruction::ZExt:
468 case Instruction::SExt:
469 case Instruction::IntToPtr:
470 case Instruction::PtrToInt:
471 case Instruction::BitCast:
472 case Instruction::PHI:
473 case Instruction::Call:
474 case Instruction::Select:
475 case Instruction::Ret:
476 case Instruction::Load:
477 break;
478 }
479
480 if (Idx == ImmIdx) {
481 int NumConstants = (BitSize + 63) / 64;
482 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
483 return (Cost <= NumConstants * TTI::TCC_Basic)
484 ? static_cast<int>(TTI::TCC_Free)
485 : Cost;
486 }
487 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
488}
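// Example: for "add i64 %x, 4096" the immediate is free to materialize (it
// is a valid logical immediate), so TCC_Free is returned and constant
// hoisting leaves the constant in place. For a wide (e.g. 128-bit) compare
// whose 64-bit chunks each need several move instructions, the cost exceeds
// NumConstants * TTI::TCC_Basic and the constant is reported as expensive,
// making it a hoisting candidate.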
489
490InstructionCost
491AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
492 const APInt &Imm, Type *Ty,
493 TTI::TargetCostKind CostKind) const {
494 assert(Ty->isIntegerTy());
495
496 unsigned BitSize = Ty->getPrimitiveSizeInBits();
497 // There is no cost model for constants with a bit size of 0. Return TCC_Free
498 // here, so that constant hoisting will ignore this constant.
499 if (BitSize == 0)
500 return TTI::TCC_Free;
501
502 // Most (all?) AArch64 intrinsics do not support folding immediates into the
503 // selected instruction, so we compute the materialization cost for the
504 // immediate directly.
505 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
506 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
507
508 switch (IID) {
509 default:
510 return TTI::TCC_Free;
511 case Intrinsic::sadd_with_overflow:
512 case Intrinsic::uadd_with_overflow:
513 case Intrinsic::ssub_with_overflow:
514 case Intrinsic::usub_with_overflow:
515 case Intrinsic::smul_with_overflow:
516 case Intrinsic::umul_with_overflow:
517 if (Idx == 1) {
518 int NumConstants = (BitSize + 63) / 64;
519 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
520 return (Cost <= NumConstants * TTI::TCC_Basic)
521 ? static_cast<int>(TTI::TCC_Free)
522 : Cost;
523 }
524 break;
525 case Intrinsic::experimental_stackmap:
526 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
527 return TTI::TCC_Free;
528 break;
529 case Intrinsic::experimental_patchpoint_void:
530 case Intrinsic::experimental_patchpoint:
531 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
532 return TTI::TCC_Free;
533 break;
534 case Intrinsic::experimental_gc_statepoint:
535 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
536 return TTI::TCC_Free;
537 break;
538 }
539 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
540}
541
542TargetTransformInfo::PopcntSupportKind
543AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
544 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
545 if (TyWidth == 32 || TyWidth == 64)
546 return TTI::PSK_FastHardware;
547 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
548 return TTI::PSK_Software;
549}
550
551static bool isUnpackedVectorVT(EVT VecVT) {
552 return VecVT.isScalableVector() &&
553 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
554}
555
556static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
557 const IntrinsicCostAttributes &ICA) {
558 // We need to know at least the number of elements in the vector of buckets
559 // and the size of each element to update.
560 if (ICA.getArgTypes().size() < 2)
561 return InstructionCost::getInvalid();
562
563 // Only interested in costing for the hardware instruction from SVE2.
564 if (!ST->hasSVE2())
565 return InstructionCost::getInvalid();
566
567 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
568 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
569 unsigned TotalHistCnts = 1;
570
571 unsigned EltSize = EltTy->getScalarSizeInBits();
572 // Only allow (up to 64b) integers or pointers
573 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
574 return InstructionCost::getInvalid();
575
576 // FIXME: We should be able to generate histcnt for fixed-length vectors
577 // using ptrue with a specific VL.
578 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
579 unsigned EC = VTy->getElementCount().getKnownMinValue();
580 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
581 return InstructionCost::getInvalid();
582
583 // HistCnt only supports 32b and 64b element types
584 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
585
586 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
587 return InstructionCost(BaseHistCntCost);
588
589 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
590 TotalHistCnts = EC / NaturalVectorWidth;
591
592 return InstructionCost(BaseHistCntCost * TotalHistCnts);
593 }
594
595 return InstructionCost::getInvalid();
596}
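// Example: on an SVE2 target, updating i32 counters through a
// <vscale x 4 x ptr> vector of bucket addresses maps onto one HISTCNT, so
// the returned cost is BaseHistCntCost (8 by default); <vscale x 8 x ptr>
// legalises to two HISTCNTs, giving BaseHistCntCost * 2.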
597
598InstructionCost
599AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
600 TTI::TargetCostKind CostKind) const {
601 // The code-generator is currently not able to handle scalable vectors
602 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
603 // it. This change will be removed when code-generation for these types is
604 // sufficiently reliable.
605 auto *RetTy = ICA.getReturnType();
606 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
607 if (VTy->getElementCount() == ElementCount::getScalable(1))
608 return InstructionCost::getInvalid();
609
610 switch (ICA.getID()) {
611 case Intrinsic::experimental_vector_histogram_add: {
612 InstructionCost HistCost = getHistogramCost(ST, ICA);
613 // If the cost isn't valid, we may still be able to scalarize
614 if (HistCost.isValid())
615 return HistCost;
616 break;
617 }
618 case Intrinsic::umin:
619 case Intrinsic::umax:
620 case Intrinsic::smin:
621 case Intrinsic::smax: {
622 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
623 MVT::v8i16, MVT::v2i32, MVT::v4i32,
624 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
625 MVT::nxv2i64};
626 auto LT = getTypeLegalizationCost(RetTy);
627 // v2i64 types get converted to cmp+bif hence the cost of 2
628 if (LT.second == MVT::v2i64)
629 return LT.first * 2;
630 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
631 return LT.first;
632 break;
633 }
634 case Intrinsic::sadd_sat:
635 case Intrinsic::ssub_sat:
636 case Intrinsic::uadd_sat:
637 case Intrinsic::usub_sat: {
638 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
639 MVT::v8i16, MVT::v2i32, MVT::v4i32,
640 MVT::v2i64};
641 auto LT = getTypeLegalizationCost(RetTy);
642 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
643 // need to extend the type, as it uses shr(qadd(shl, shl)).
644 unsigned Instrs =
645 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
646 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
647 return LT.first * Instrs;
648
650 uint64_t VectorSize = TS.getKnownMinValue();
651
652 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
653 return LT.first * Instrs;
654
655 break;
656 }
657 case Intrinsic::abs: {
658 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
659 MVT::v8i16, MVT::v2i32, MVT::v4i32,
660 MVT::v2i64};
661 auto LT = getTypeLegalizationCost(RetTy);
662 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
663 return LT.first;
664 break;
665 }
666 case Intrinsic::bswap: {
667 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
668 MVT::v4i32, MVT::v2i64};
669 auto LT = getTypeLegalizationCost(RetTy);
670 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
671 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
672 return LT.first;
673 break;
674 }
675 case Intrinsic::fma:
676 case Intrinsic::fmuladd: {
677 // Given a fma or fmuladd, cost it the same as a fmul instruction, which
678 // usually has the same cost. TODO: Add fp16 and bf16 expansion costs.
679 Type *EltTy = RetTy->getScalarType();
680 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
681 (EltTy->isHalfTy() && ST->hasFullFP16()))
682 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
683 break;
684 }
685 case Intrinsic::stepvector: {
686 InstructionCost Cost = 1; // Cost of the `index' instruction
687 auto LT = getTypeLegalizationCost(RetTy);
688 // Legalisation of illegal vectors involves an `index' instruction plus
689 // (LT.first - 1) vector adds.
690 if (LT.first > 1) {
691 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
692 InstructionCost AddCost =
693 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
694 Cost += AddCost * (LT.first - 1);
695 }
696 return Cost;
697 }
698 case Intrinsic::vector_extract:
699 case Intrinsic::vector_insert: {
700 // If both the vector and subvector types are legal types and the index
701 // is 0, then this should be a no-op or simple operation; return a
702 // relatively low cost.
703
704 // If arguments aren't actually supplied, then we cannot determine the
705 // value of the index. We also want to skip predicate types.
706 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
708 break;
709
710 LLVMContext &C = RetTy->getContext();
711 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
712 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
713 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
714 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
715 // Skip this if either the vector or subvector types are unpacked
716 // SVE types; they may get lowered to stack stores and loads.
717 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
718 break;
719
720 TargetLoweringBase::LegalizeKind SubVecLK =
721 getTLI()->getTypeConversion(C, SubVecVT);
722 TargetLoweringBase::LegalizeKind VecLK =
723 getTLI()->getTypeConversion(C, VecVT);
724 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
725 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
726 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
727 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
728 return TTI::TCC_Free;
729 break;
730 }
731 case Intrinsic::bitreverse: {
732 static const CostTblEntry BitreverseTbl[] = {
733 {Intrinsic::bitreverse, MVT::i32, 1},
734 {Intrinsic::bitreverse, MVT::i64, 1},
735 {Intrinsic::bitreverse, MVT::v8i8, 1},
736 {Intrinsic::bitreverse, MVT::v16i8, 1},
737 {Intrinsic::bitreverse, MVT::v4i16, 2},
738 {Intrinsic::bitreverse, MVT::v8i16, 2},
739 {Intrinsic::bitreverse, MVT::v2i32, 2},
740 {Intrinsic::bitreverse, MVT::v4i32, 2},
741 {Intrinsic::bitreverse, MVT::v1i64, 2},
742 {Intrinsic::bitreverse, MVT::v2i64, 2},
743 };
744 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
745 const auto *Entry =
746 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
747 if (Entry) {
748 // The cost model uses the legal type (i32) that i8 and i16 are promoted
749 // to, plus 1 so that we match the actual lowering cost.
750 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
751 TLI->getValueType(DL, RetTy, true) == MVT::i16)
752 return LegalisationCost.first * Entry->Cost + 1;
753
754 return LegalisationCost.first * Entry->Cost;
755 }
756 break;
757 }
758 case Intrinsic::ctpop: {
759 if (!ST->hasNEON()) {
760 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
761 return getTypeLegalizationCost(RetTy).first * 12;
762 }
763 static const CostTblEntry CtpopCostTbl[] = {
764 {ISD::CTPOP, MVT::v2i64, 4},
765 {ISD::CTPOP, MVT::v4i32, 3},
766 {ISD::CTPOP, MVT::v8i16, 2},
767 {ISD::CTPOP, MVT::v16i8, 1},
768 {ISD::CTPOP, MVT::i64, 4},
769 {ISD::CTPOP, MVT::v2i32, 3},
770 {ISD::CTPOP, MVT::v4i16, 2},
771 {ISD::CTPOP, MVT::v8i8, 1},
772 {ISD::CTPOP, MVT::i32, 5},
773 };
774 auto LT = getTypeLegalizationCost(RetTy);
775 MVT MTy = LT.second;
776 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
777 // Extra cost of +1 when illegal vector types are legalized by promoting
778 // the integer type.
779 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
780 RetTy->getScalarSizeInBits()
781 ? 1
782 : 0;
783 return LT.first * Entry->Cost + ExtraCost;
784 }
785 break;
786 }
787 case Intrinsic::sadd_with_overflow:
788 case Intrinsic::uadd_with_overflow:
789 case Intrinsic::ssub_with_overflow:
790 case Intrinsic::usub_with_overflow:
791 case Intrinsic::smul_with_overflow:
792 case Intrinsic::umul_with_overflow: {
793 static const CostTblEntry WithOverflowCostTbl[] = {
794 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
795 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
796 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
797 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
798 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
799 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
800 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
801 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
802 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
803 {Intrinsic::usub_with_overflow, MVT::i8, 3},
804 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
805 {Intrinsic::usub_with_overflow, MVT::i16, 3},
806 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
807 {Intrinsic::usub_with_overflow, MVT::i32, 1},
808 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
809 {Intrinsic::usub_with_overflow, MVT::i64, 1},
810 {Intrinsic::smul_with_overflow, MVT::i8, 5},
811 {Intrinsic::umul_with_overflow, MVT::i8, 4},
812 {Intrinsic::smul_with_overflow, MVT::i16, 5},
813 {Intrinsic::umul_with_overflow, MVT::i16, 4},
814 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
815 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
816 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
817 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
818 };
819 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
820 if (MTy.isSimple())
821 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
822 MTy.getSimpleVT()))
823 return Entry->Cost;
824 break;
825 }
826 case Intrinsic::fptosi_sat:
827 case Intrinsic::fptoui_sat: {
828 if (ICA.getArgTypes().empty())
829 break;
830 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
831 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
832 EVT MTy = TLI->getValueType(DL, RetTy);
833 // Check for the legal types, which are where the size of the input and the
834 // output are the same, or we are using cvt f64->i32 or f32->i64.
835 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
836 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
837 LT.second == MVT::v2f64)) {
838 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
839 (LT.second == MVT::f64 && MTy == MVT::i32) ||
840 (LT.second == MVT::f32 && MTy == MVT::i64)))
841 return LT.first;
842 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
843 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
844 MTy.getScalarSizeInBits() == 64)
845 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
846 }
847 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
848 // f32.
849 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
850 return LT.first + getIntrinsicInstrCost(
851 {ICA.getID(),
852 RetTy,
853 {ICA.getArgTypes()[0]->getWithNewType(
854 Type::getFloatTy(RetTy->getContext()))}},
855 CostKind);
856 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
857 (LT.second == MVT::f16 && MTy == MVT::i64) ||
858 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
859 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
860 return LT.first;
861 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
862 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
863 MTy.getScalarSizeInBits() == 32)
864 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
865 // Extending vector types v8f16->v8i32. These current scalarize but the
866 // codegen could be better.
867 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
868 MTy.getScalarSizeInBits() == 64)
869 return MTy.getVectorNumElements() * 3;
870
871 // If we can we use a legal convert followed by a min+max
872 if ((LT.second.getScalarType() == MVT::f32 ||
873 LT.second.getScalarType() == MVT::f64 ||
874 LT.second.getScalarType() == MVT::f16) &&
875 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
876 Type *LegalTy =
877 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
878 if (LT.second.isVector())
879 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
880 InstructionCost Cost = 1;
881 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
882 LegalTy, {LegalTy, LegalTy});
883 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
884 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
885 LegalTy, {LegalTy, LegalTy});
886 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
887 return LT.first * Cost +
888 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
889 : 1);
890 }
891 // Otherwise we need to follow the default expansion that clamps the value
892 // using a float min/max with a fcmp+sel for nan handling when signed.
893 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
894 RetTy = RetTy->getScalarType();
895 if (LT.second.isVector()) {
896 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
897 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
898 }
899 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
900 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
901 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
902 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
903 Cost +=
904 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
905 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
906 if (IsSigned) {
907 Type *CondTy = RetTy->getWithNewBitWidth(1);
908 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
909 CmpInst::FCMP_UNO, CostKind);
910 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
911 CmpInst::FCMP_UNO, CostKind);
912 }
913 return LT.first * Cost;
914 }
915 case Intrinsic::fshl:
916 case Intrinsic::fshr: {
917 if (ICA.getArgs().empty())
918 break;
919
920 // TODO: Add handling for fshl where third argument is not a constant.
921 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
922 if (!OpInfoZ.isConstant())
923 break;
924
925 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
926 if (OpInfoZ.isUniform()) {
927 static const CostTblEntry FshlTbl[] = {
928 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
929 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
930 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
931 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
932 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
933 // to avoid having to duplicate the costs.
934 const auto *Entry =
935 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
936 if (Entry)
937 return LegalisationCost.first * Entry->Cost;
938 }
939
940 auto TyL = getTypeLegalizationCost(RetTy);
941 if (!RetTy->isIntegerTy())
942 break;
943
944 // Estimate cost manually, as types like i8 and i16 will get promoted to
945 // i32 and CostTableLookup will ignore the extra conversion cost.
946 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
947 RetTy->getScalarSizeInBits() < 64) ||
948 (RetTy->getScalarSizeInBits() % 64 != 0);
949 unsigned ExtraCost = HigherCost ? 1 : 0;
950 if (RetTy->getScalarSizeInBits() == 32 ||
951 RetTy->getScalarSizeInBits() == 64)
952 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
953 // extr instruction.
954 else if (HigherCost)
955 ExtraCost = 1;
956 else
957 break;
958 return TyL.first + ExtraCost;
959 }
960 case Intrinsic::get_active_lane_mask: {
961 auto RetTy = cast<VectorType>(ICA.getReturnType());
962 EVT RetVT = getTLI()->getValueType(DL, RetTy);
963 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
964 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
965 break;
966
967 if (RetTy->isScalableTy()) {
968 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
970 break;
971
972 auto LT = getTypeLegalizationCost(RetTy);
973 InstructionCost Cost = LT.first;
974 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
975 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
976 // nxv32i1 = get_active_lane_mask(base, idx) ->
977 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
978 if (ST->hasSVE2p1() || ST->hasSME2()) {
979 Cost /= 2;
980 if (Cost == 1)
981 return Cost;
982 }
983
984 // If more than one whilelo intrinsic is required, include the extra cost
985 // required by the saturating add & select required to increment the
986 // start value after the first intrinsic call.
987 Type *OpTy = ICA.getArgTypes()[0];
988 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
989 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
990 Type *CondTy = OpTy->getWithNewBitWidth(1);
991 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
992 CmpInst::BAD_ICMP_PREDICATE, CostKind);
993 return Cost + (SplitCost * (Cost - 1));
994 } else if (!getTLI()->isTypeLegal(RetVT)) {
995 // We don't have enough context at this point to determine if the mask
996 // is going to be kept live after the block, which will force the vXi1
997 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
998 // For now, we just assume the vectorizer created this intrinsic and
999 // the result will be the input for a PHI. In this case the cost will
1000 // be extremely high for fixed-width vectors.
1001 // NOTE: getScalarizationOverhead returns a cost that's far too
1002 // pessimistic for the actual generated codegen. In reality there are
1003 // two instructions generated per lane.
1004 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1005 }
1006 break;
1007 }
1008 case Intrinsic::experimental_vector_match: {
1009 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1010 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1011 unsigned SearchSize = NeedleTy->getNumElements();
1012 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1013 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1014 // Neoverse V3, these are cheap operations with the same latency as a
1015 // vector ADD. In most cases, however, we also need to do an extra DUP.
1016 // For fixed-length vectors we currently need an extra five or six
1017 // instructions besides the MATCH.
1019 if (isa<FixedVectorType>(RetTy))
1020 Cost += 10;
1021 return Cost;
1022 }
1023 break;
1024 }
1025 case Intrinsic::experimental_cttz_elts: {
1026 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1027 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1028 // This will consist of a SVE brkb and a cntp instruction. These
1029 // typically have the same latency and half the throughput as a vector
1030 // add instruction.
1031 return 4;
1032 }
1033 break;
1034 }
1035 case Intrinsic::experimental_vector_extract_last_active:
1036 if (ST->isSVEorStreamingSVEAvailable()) {
1037 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1038 // This should turn into chained clastb instructions.
1039 return LegalCost;
1040 }
1041 break;
1042 default:
1043 break;
1044 }
1045 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1046}
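// Example: "call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 7)" has a constant
// shift amount and needs no promotion, so the fshl handling above returns
// TyL.first with ExtraCost = 0, matching the single EXTR/ROR the backend
// emits; the equivalent i16 funnel shift pays ExtraCost = 1 for the
// promotion to i32.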
1047
1048/// The function will remove redundant reinterpret casts in the presence
1049/// of control flow.
1050static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1051 IntrinsicInst &II) {
1052 SmallVector<Instruction *, 32> Worklist;
1053 auto RequiredType = II.getType();
1054
1055 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1056 assert(PN && "Expected Phi Node!");
1057
1058 // Don't create a new Phi unless we can remove the old one.
1059 if (!PN->hasOneUse())
1060 return std::nullopt;
1061
1062 for (Value *IncValPhi : PN->incoming_values()) {
1063 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1064 if (!Reinterpret ||
1065 Reinterpret->getIntrinsicID() !=
1066 Intrinsic::aarch64_sve_convert_to_svbool ||
1067 RequiredType != Reinterpret->getArgOperand(0)->getType())
1068 return std::nullopt;
1069 }
1070
1071 // Create the new Phi
1072 IC.Builder.SetInsertPoint(PN);
1073 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1074 Worklist.push_back(PN);
1075
1076 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1077 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1078 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1079 Worklist.push_back(Reinterpret);
1080 }
1081
1082 // Cleanup Phi Node and reinterprets
1083 return IC.replaceInstUsesWith(II, NPN);
1084}
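// For example, given IR of the form
//   %phi = phi <vscale x 16 x i1> [ %c1, %bb1 ], [ %c2, %bb2 ]
//   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
// where %c1 and %c2 are convert.to.svbool casts of <vscale x 4 x i1> values,
// %res is replaced by a new phi over the original <vscale x 4 x i1> operands
// and both conversions become dead.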
1085
1086// A collection of properties common to SVE intrinsics that allow for combines
1087// to be written without needing to know the specific intrinsic.
1088struct SVEIntrinsicInfo {
1089 //
1090 // Helper routines for common intrinsic definitions.
1091 //
1092
1093 // e.g. llvm.aarch64.sve.add pg, op1, op2
1094 // with IID ==> llvm.aarch64.sve.add_u
1095 static SVEIntrinsicInfo
1102
1103 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1110
1111 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1117
1118 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1124
1125 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1126 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1127 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1128 return SVEIntrinsicInfo()
1131 }
1132
1133 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1134 // llvm.aarch64.sve.ld1 pg, ptr
1141
1142 // All properties relate to predication and thus having a general predicate
1143 // is the minimum requirement to say there is intrinsic info to act on.
1144 explicit operator bool() const { return hasGoverningPredicate(); }
1145
1146 //
1147 // Properties relating to the governing predicate.
1148 //
1149
1151 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1152 }
1153
1155 assert(hasGoverningPredicate() && "Property not set!");
1156 return GoverningPredicateIdx;
1157 }
1158
1160 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1161 GoverningPredicateIdx = Index;
1162 return *this;
1163 }
1164
1165 //
1166 // Properties relating to operations the intrinsic could be transformed into.
1167 // NOTE: This does not mean such a transformation is always possible, but the
1168 // knowledge makes it possible to reuse existing optimisations without needing
1169 // to embed specific handling for each intrinsic. For example, instruction
1170 // simplification can be used to optimise an intrinsic's active lanes.
1171 //
1172
1174 return UndefIntrinsic != Intrinsic::not_intrinsic;
1175 }
1176
1178 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1179 return UndefIntrinsic;
1180 }
1181
1183 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1184 UndefIntrinsic = IID;
1185 return *this;
1186 }
1187
1188 bool hasMatchingIROpode() const { return IROpcode != 0; }
1189
1190 unsigned getMatchingIROpode() const {
1191 assert(hasMatchingIROpode() && "Property not set!");
1192 return IROpcode;
1193 }
1194
1196 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1197 IROpcode = Opcode;
1198 return *this;
1199 }
1200
1201 //
1202 // Properties relating to the result of inactive lanes.
1203 //
1204
1206 return ResultLanes == InactiveLanesTakenFromOperand;
1207 }
1208
1210 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1211 return OperandIdxForInactiveLanes;
1212 }
1213
1215 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1216 ResultLanes = InactiveLanesTakenFromOperand;
1217 OperandIdxForInactiveLanes = Index;
1218 return *this;
1219 }
1220
1222 return ResultLanes == InactiveLanesAreNotDefined;
1223 }
1224
1226 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1227 ResultLanes = InactiveLanesAreNotDefined;
1228 return *this;
1229 }
1230
1232 return ResultLanes == InactiveLanesAreUnused;
1233 }
1234
1236 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1237 ResultLanes = InactiveLanesAreUnused;
1238 return *this;
1239 }
1240
1241 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1242 // inactiveLanesAreZeroed =
1243 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1244 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1245
1247 ResultIsZeroInitialized = true;
1248 return *this;
1249 }
1250
1251 //
1252 // The first operand of unary merging operations is typically only used to
1253 // set the result for inactive lanes. Knowing this allows us to deadcode the
1254 // operand when we can prove there are no inactive lanes.
1255 //
1256
1258 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1259 }
1260
1262 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1263 return OperandIdxWithNoActiveLanes;
1264 }
1265
1267 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1268 OperandIdxWithNoActiveLanes = Index;
1269 return *this;
1270 }
1271
1272private:
1273 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1274
1275 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1276 unsigned IROpcode = 0;
1277
1278 enum PredicationStyle {
1280 InactiveLanesTakenFromOperand,
1281 InactiveLanesAreNotDefined,
1282 InactiveLanesAreUnused
1283 } ResultLanes = Uninitialized;
1284
1285 bool ResultIsZeroInitialized = false;
1286 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1287 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1288};
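// The properties compose through the builder-style setters above; for
// example, the entry for llvm.aarch64.sve.add in the switch below is
//   SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
//       .setMatchingIROpcode(Instruction::Add);
// recording the governing predicate position, the _u variant to use when
// every lane is known active, and the IR opcode that instruction
// simplification may apply to the active lanes.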
1289
1290static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1291 // Some SVE intrinsics do not use scalable vector types, but since they are
1292 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1293 if (!isa<ScalableVectorType>(II.getType()) &&
1294 all_of(II.args(), [&](const Value *V) {
1295 return !isa<ScalableVectorType>(V->getType());
1296 }))
1297 return SVEIntrinsicInfo();
1298
1299 Intrinsic::ID IID = II.getIntrinsicID();
1300 switch (IID) {
1301 default:
1302 break;
1303 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1304 case Intrinsic::aarch64_sve_fcvt_f16f32:
1305 case Intrinsic::aarch64_sve_fcvt_f16f64:
1306 case Intrinsic::aarch64_sve_fcvt_f32f16:
1307 case Intrinsic::aarch64_sve_fcvt_f32f64:
1308 case Intrinsic::aarch64_sve_fcvt_f64f16:
1309 case Intrinsic::aarch64_sve_fcvt_f64f32:
1310 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1311 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1312 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1313 case Intrinsic::aarch64_sve_fcvtzs:
1314 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1315 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1316 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1317 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1318 case Intrinsic::aarch64_sve_fcvtzu:
1319 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1320 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1321 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1322 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1323 case Intrinsic::aarch64_sve_scvtf:
1324 case Intrinsic::aarch64_sve_scvtf_f16i32:
1325 case Intrinsic::aarch64_sve_scvtf_f16i64:
1326 case Intrinsic::aarch64_sve_scvtf_f32i64:
1327 case Intrinsic::aarch64_sve_scvtf_f64i32:
1328 case Intrinsic::aarch64_sve_ucvtf:
1329 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1330 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1331 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1332 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1334
1335 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1336 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1337 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1338 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1340
1341 case Intrinsic::aarch64_sve_fabd:
1342 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1343 case Intrinsic::aarch64_sve_fadd:
1344 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1345 .setMatchingIROpcode(Instruction::FAdd);
1346 case Intrinsic::aarch64_sve_fdiv:
1347 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1348 .setMatchingIROpcode(Instruction::FDiv);
1349 case Intrinsic::aarch64_sve_fmax:
1350 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1351 case Intrinsic::aarch64_sve_fmaxnm:
1352 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1353 case Intrinsic::aarch64_sve_fmin:
1354 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1355 case Intrinsic::aarch64_sve_fminnm:
1356 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1357 case Intrinsic::aarch64_sve_fmla:
1358 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1359 case Intrinsic::aarch64_sve_fmls:
1360 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1361 case Intrinsic::aarch64_sve_fmul:
1362 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1363 .setMatchingIROpcode(Instruction::FMul);
1364 case Intrinsic::aarch64_sve_fmulx:
1365 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1366 case Intrinsic::aarch64_sve_fnmla:
1367 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1368 case Intrinsic::aarch64_sve_fnmls:
1369 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1370 case Intrinsic::aarch64_sve_fsub:
1371 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1372 .setMatchingIROpcode(Instruction::FSub);
1373 case Intrinsic::aarch64_sve_add:
1374 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1375 .setMatchingIROpcode(Instruction::Add);
1376 case Intrinsic::aarch64_sve_mla:
1377 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1378 case Intrinsic::aarch64_sve_mls:
1379 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1380 case Intrinsic::aarch64_sve_mul:
1381 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1382 .setMatchingIROpcode(Instruction::Mul);
1383 case Intrinsic::aarch64_sve_sabd:
1384 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1385 case Intrinsic::aarch64_sve_sdiv:
1386 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1387 .setMatchingIROpcode(Instruction::SDiv);
1388 case Intrinsic::aarch64_sve_smax:
1389 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1390 case Intrinsic::aarch64_sve_smin:
1391 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1392 case Intrinsic::aarch64_sve_smulh:
1393 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1394 case Intrinsic::aarch64_sve_sub:
1395 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1396 .setMatchingIROpcode(Instruction::Sub);
1397 case Intrinsic::aarch64_sve_uabd:
1398 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1399 case Intrinsic::aarch64_sve_udiv:
1400 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1401 .setMatchingIROpcode(Instruction::UDiv);
1402 case Intrinsic::aarch64_sve_umax:
1403 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1404 case Intrinsic::aarch64_sve_umin:
1405 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1406 case Intrinsic::aarch64_sve_umulh:
1407 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1408 case Intrinsic::aarch64_sve_asr:
1409 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1410 .setMatchingIROpcode(Instruction::AShr);
1411 case Intrinsic::aarch64_sve_lsl:
1412 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1413 .setMatchingIROpcode(Instruction::Shl);
1414 case Intrinsic::aarch64_sve_lsr:
1415 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1416 .setMatchingIROpcode(Instruction::LShr);
1417 case Intrinsic::aarch64_sve_and:
1418 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1419 .setMatchingIROpcode(Instruction::And);
1420 case Intrinsic::aarch64_sve_bic:
1421 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1422 case Intrinsic::aarch64_sve_eor:
1423 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1424 .setMatchingIROpcode(Instruction::Xor);
1425 case Intrinsic::aarch64_sve_orr:
1426 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1427 .setMatchingIROpcode(Instruction::Or);
1428 case Intrinsic::aarch64_sve_sqsub:
1429 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1430 case Intrinsic::aarch64_sve_uqsub:
1431 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1432
1433 case Intrinsic::aarch64_sve_add_u:
1435 Instruction::Add);
1436 case Intrinsic::aarch64_sve_and_u:
1438 Instruction::And);
1439 case Intrinsic::aarch64_sve_asr_u:
1441 Instruction::AShr);
1442 case Intrinsic::aarch64_sve_eor_u:
1444 Instruction::Xor);
1445 case Intrinsic::aarch64_sve_fadd_u:
1447 Instruction::FAdd);
1448 case Intrinsic::aarch64_sve_fdiv_u:
1450 Instruction::FDiv);
1451 case Intrinsic::aarch64_sve_fmul_u:
1453 Instruction::FMul);
1454 case Intrinsic::aarch64_sve_fsub_u:
1456 Instruction::FSub);
1457 case Intrinsic::aarch64_sve_lsl_u:
1459 Instruction::Shl);
1460 case Intrinsic::aarch64_sve_lsr_u:
1462 Instruction::LShr);
1463 case Intrinsic::aarch64_sve_mul_u:
1465 Instruction::Mul);
1466 case Intrinsic::aarch64_sve_orr_u:
1468 Instruction::Or);
1469 case Intrinsic::aarch64_sve_sdiv_u:
1471 Instruction::SDiv);
1472 case Intrinsic::aarch64_sve_sub_u:
1474 Instruction::Sub);
1475 case Intrinsic::aarch64_sve_udiv_u:
1477 Instruction::UDiv);
1478
1479 case Intrinsic::aarch64_sve_addqv:
1480 case Intrinsic::aarch64_sve_and_z:
1481 case Intrinsic::aarch64_sve_bic_z:
1482 case Intrinsic::aarch64_sve_brka_z:
1483 case Intrinsic::aarch64_sve_brkb_z:
1484 case Intrinsic::aarch64_sve_brkn_z:
1485 case Intrinsic::aarch64_sve_brkpa_z:
1486 case Intrinsic::aarch64_sve_brkpb_z:
1487 case Intrinsic::aarch64_sve_cntp:
1488 case Intrinsic::aarch64_sve_compact:
1489 case Intrinsic::aarch64_sve_eor_z:
1490 case Intrinsic::aarch64_sve_eorv:
1491 case Intrinsic::aarch64_sve_eorqv:
1492 case Intrinsic::aarch64_sve_nand_z:
1493 case Intrinsic::aarch64_sve_nor_z:
1494 case Intrinsic::aarch64_sve_orn_z:
1495 case Intrinsic::aarch64_sve_orr_z:
1496 case Intrinsic::aarch64_sve_orv:
1497 case Intrinsic::aarch64_sve_orqv:
1498 case Intrinsic::aarch64_sve_pnext:
1499 case Intrinsic::aarch64_sve_rdffr_z:
1500 case Intrinsic::aarch64_sve_saddv:
1501 case Intrinsic::aarch64_sve_uaddv:
1502 case Intrinsic::aarch64_sve_umaxv:
1503 case Intrinsic::aarch64_sve_umaxqv:
1504 case Intrinsic::aarch64_sve_cmpeq:
1505 case Intrinsic::aarch64_sve_cmpeq_wide:
1506 case Intrinsic::aarch64_sve_cmpge:
1507 case Intrinsic::aarch64_sve_cmpge_wide:
1508 case Intrinsic::aarch64_sve_cmpgt:
1509 case Intrinsic::aarch64_sve_cmpgt_wide:
1510 case Intrinsic::aarch64_sve_cmphi:
1511 case Intrinsic::aarch64_sve_cmphi_wide:
1512 case Intrinsic::aarch64_sve_cmphs:
1513 case Intrinsic::aarch64_sve_cmphs_wide:
1514 case Intrinsic::aarch64_sve_cmple_wide:
1515 case Intrinsic::aarch64_sve_cmplo_wide:
1516 case Intrinsic::aarch64_sve_cmpls_wide:
1517 case Intrinsic::aarch64_sve_cmplt_wide:
1518 case Intrinsic::aarch64_sve_cmpne:
1519 case Intrinsic::aarch64_sve_cmpne_wide:
1520 case Intrinsic::aarch64_sve_facge:
1521 case Intrinsic::aarch64_sve_facgt:
1522 case Intrinsic::aarch64_sve_fcmpeq:
1523 case Intrinsic::aarch64_sve_fcmpge:
1524 case Intrinsic::aarch64_sve_fcmpgt:
1525 case Intrinsic::aarch64_sve_fcmpne:
1526 case Intrinsic::aarch64_sve_fcmpuo:
1527 case Intrinsic::aarch64_sve_ld1:
1528 case Intrinsic::aarch64_sve_ld1_gather:
1529 case Intrinsic::aarch64_sve_ld1_gather_index:
1530 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1531 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1532 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1533 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1534 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1535 case Intrinsic::aarch64_sve_ld1q_gather_index:
1536 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1537 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1538 case Intrinsic::aarch64_sve_ld1ro:
1539 case Intrinsic::aarch64_sve_ld1rq:
1540 case Intrinsic::aarch64_sve_ld1udq:
1541 case Intrinsic::aarch64_sve_ld1uwq:
1542 case Intrinsic::aarch64_sve_ld2_sret:
1543 case Intrinsic::aarch64_sve_ld2q_sret:
1544 case Intrinsic::aarch64_sve_ld3_sret:
1545 case Intrinsic::aarch64_sve_ld3q_sret:
1546 case Intrinsic::aarch64_sve_ld4_sret:
1547 case Intrinsic::aarch64_sve_ld4q_sret:
1548 case Intrinsic::aarch64_sve_ldff1:
1549 case Intrinsic::aarch64_sve_ldff1_gather:
1550 case Intrinsic::aarch64_sve_ldff1_gather_index:
1551 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1552 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1553 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1554 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1555 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1556 case Intrinsic::aarch64_sve_ldnf1:
1557 case Intrinsic::aarch64_sve_ldnt1:
1558 case Intrinsic::aarch64_sve_ldnt1_gather:
1559 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1560 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1561 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1562 return SVEIntrinsicInfo::defaultZeroingOp();
1563
1564 case Intrinsic::aarch64_sve_prf:
1565 case Intrinsic::aarch64_sve_prfb_gather_index:
1566 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1567 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1568 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1569 case Intrinsic::aarch64_sve_prfd_gather_index:
1570 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1571 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1572 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1573 case Intrinsic::aarch64_sve_prfh_gather_index:
1574 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1575 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1576 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1577 case Intrinsic::aarch64_sve_prfw_gather_index:
1578 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1579 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1580 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1581 return SVEIntrinsicInfo::defaultVoidOp(0);
1582
1583 case Intrinsic::aarch64_sve_st1_scatter:
1584 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1585 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1586 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1587 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1588 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1589 case Intrinsic::aarch64_sve_st1dq:
1590 case Intrinsic::aarch64_sve_st1q_scatter_index:
1591 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1592 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1593 case Intrinsic::aarch64_sve_st1wq:
1594 case Intrinsic::aarch64_sve_stnt1:
1595 case Intrinsic::aarch64_sve_stnt1_scatter:
1596 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1597 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1598 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1599 return SVEIntrinsicInfo::defaultVoidOp(1);
1600 case Intrinsic::aarch64_sve_st2:
1601 case Intrinsic::aarch64_sve_st2q:
1602 return SVEIntrinsicInfo::defaultVoidOp(2);
1603 case Intrinsic::aarch64_sve_st3:
1604 case Intrinsic::aarch64_sve_st3q:
1605 return SVEIntrinsicInfo::defaultVoidOp(3);
1606 case Intrinsic::aarch64_sve_st4:
1607 case Intrinsic::aarch64_sve_st4q:
1608 return SVEIntrinsicInfo::defaultVoidOp(4);
1609 }
1610
1611 return SVEIntrinsicInfo();
1612}
1613
1614static bool isAllActivePredicate(Value *Pred) {
1615 Value *UncastedPred;
1616
1617 // Look through predicate casts that only remove lanes.
1618 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1619 m_Value(UncastedPred)))) {
1620 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1621 Pred = UncastedPred;
1622
1623 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1624 m_Value(UncastedPred))))
1625 // If the predicate has the same number of lanes as the uncasted
1626 // predicate, or fewer, then we know the casting has no effect.
1627 if (OrigPredTy->getMinNumElements() <=
1628 cast<ScalableVectorType>(UncastedPred->getType())
1629 ->getMinNumElements())
1630 Pred = UncastedPred;
1631 }
1632
1633 auto *C = dyn_cast<Constant>(Pred);
1634 return C && C->isAllOnesValue();
1635}
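// Illustrative note (not in the original source): a constant all-ones
// predicate is trivially all active, and so is
//   %p = sve.convert.from.svbool(sve.convert.to.svbool(splat (i1 true)))
// provided %p has no more lanes than the inner predicate, since the round
// trip cannot introduce zeroed lanes.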
1636
1637// Simplify `V` by only considering the operations that affect active lanes.
1638// This function should only return existing Values or newly created Constants.
1639static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1640 auto *Dup = dyn_cast<IntrinsicInst>(V);
1641 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1642 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1643 return ConstantVector::getSplat(
1644 cast<VectorType>(V->getType())->getElementCount(),
1645 cast<Constant>(Dup->getOperand(2)));
1646
1647 return V;
1648}
1649
1650static std::optional<Instruction *>
1651simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1652 const SVEIntrinsicInfo &IInfo) {
1653 const unsigned Opc = IInfo.getMatchingIROpode();
1654 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1655
1656 Value *Pg = II.getOperand(0);
1657 Value *Op1 = II.getOperand(1);
1658 Value *Op2 = II.getOperand(2);
1659 const DataLayout &DL = II.getDataLayout();
1660
1661 // Canonicalise constants to the RHS.
1662 if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
1663 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1664 IC.replaceOperand(II, 1, Op2);
1665 IC.replaceOperand(II, 2, Op1);
1666 return &II;
1667 }
1668
1669 // Only active lanes matter when simplifying the operation.
1670 Op1 = stripInactiveLanes(Op1, Pg);
1671 Op2 = stripInactiveLanes(Op2, Pg);
1672
1673 Value *SimpleII;
1674 if (auto FII = dyn_cast<FPMathOperator>(&II))
1675 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1676 else
1677 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1678
1679 // An SVE intrinsic's result is always defined. However, this is not the case
1680 // for its equivalent IR instruction (e.g. when shifting by an amount more
1681 // than the data's bitwidth). Simplifications to an undefined result must be
1682 // ignored to preserve the intrinsic's expected behaviour.
1683 if (!SimpleII || isa<UndefValue>(SimpleII))
1684 return std::nullopt;
1685
1686 if (IInfo.inactiveLanesAreNotDefined())
1687 return IC.replaceInstUsesWith(II, SimpleII);
1688
1689 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1690
1691 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1692 if (SimpleII == Inactive)
1693 return IC.replaceInstUsesWith(II, SimpleII);
1694
1695 // Inactive lanes must be preserved.
1696 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1697 return IC.replaceInstUsesWith(II, SimpleII);
1698}
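// Illustrative examples (not in the original source), for a merging integer
// multiply: sve.mul(pg, A, splat(1)) simplifies to A, which is also the
// operand its inactive lanes come from, so the whole call folds to A;
// sve.mul(pg, A, splat(0)) is only zero in the active lanes, so it becomes
// select(pg, zeroinitializer, A).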
1699
1700// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1701// to operations with less strict inactive lane requirements.
1702static std::optional<Instruction *>
1703simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1704 const SVEIntrinsicInfo &IInfo) {
1705 if (!IInfo.hasGoverningPredicate())
1706 return std::nullopt;
1707
1708 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1709
1710 // If there are no active lanes.
1711 if (match(OpPredicate, m_ZeroInt())) {
1712 if (IInfo.inactiveLanesTakenFromOperand())
1713 return IC.replaceInstUsesWith(
1714 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1715
1716 if (IInfo.inactiveLanesAreUnused()) {
1717 if (IInfo.resultIsZeroInitialized())
1718 IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1719
1720 return IC.eraseInstFromFunction(II);
1721 }
1722 }
1723
1724 // If there are no inactive lanes.
1725 if (isAllActivePredicate(OpPredicate)) {
1726 if (IInfo.hasOperandWithNoActiveLanes()) {
1727 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1728 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1729 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1730 }
1731
1732 if (IInfo.hasMatchingUndefIntrinsic()) {
1733 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1734 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1735 II.setCalledFunction(NewDecl);
1736 return &II;
1737 }
1738 }
1739
1740 // Operation specific simplifications.
1741 if (IInfo.hasMatchingIROpode() &&
1742 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1743 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1744
1745 return std::nullopt;
1746}
1747
1748// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1749// => (binop (pred) (from_svbool _) (from_svbool _))
1750//
1751// The above transformation eliminates a `to_svbool` in the predicate
1752// operand of bitwise operation `binop` by narrowing the vector width of
1753// the operation. For example, it would convert a `<vscale x 16 x i1>
1754// and` into a `<vscale x 4 x i1> and`. This is profitable because
1755// to_svbool must zero the new lanes during widening, whereas
1756// from_svbool is free.
1757static std::optional<Instruction *>
1758tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1759 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1760 if (!BinOp)
1761 return std::nullopt;
1762
1763 auto IntrinsicID = BinOp->getIntrinsicID();
1764 switch (IntrinsicID) {
1765 case Intrinsic::aarch64_sve_and_z:
1766 case Intrinsic::aarch64_sve_bic_z:
1767 case Intrinsic::aarch64_sve_eor_z:
1768 case Intrinsic::aarch64_sve_nand_z:
1769 case Intrinsic::aarch64_sve_nor_z:
1770 case Intrinsic::aarch64_sve_orn_z:
1771 case Intrinsic::aarch64_sve_orr_z:
1772 break;
1773 default:
1774 return std::nullopt;
1775 }
1776
1777 auto BinOpPred = BinOp->getOperand(0);
1778 auto BinOpOp1 = BinOp->getOperand(1);
1779 auto BinOpOp2 = BinOp->getOperand(2);
1780
1781 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1782 if (!PredIntr ||
1783 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1784 return std::nullopt;
1785
1786 auto PredOp = PredIntr->getOperand(0);
1787 auto PredOpTy = cast<VectorType>(PredOp->getType());
1788 if (PredOpTy != II.getType())
1789 return std::nullopt;
1790
1791 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1792 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1793 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1794 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1795 if (BinOpOp1 == BinOpOp2)
1796 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1797 else
1798 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1799 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1800
1801 auto NarrowedBinOp =
1802 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1803 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1804}
1805
1806static std::optional<Instruction *>
1807instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1808 // If the reinterpret instruction operand is a PHI Node
1809 if (isa<PHINode>(II.getArgOperand(0)))
1810 return processPhiNode(IC, II);
1811
1812 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1813 return BinOpCombine;
1814
1815 // Ignore converts to/from svcount_t.
1816 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1817 isa<TargetExtType>(II.getType()))
1818 return std::nullopt;
1819
1820 SmallVector<Instruction *, 32> CandidatesForRemoval;
1821 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1822
1823 const auto *IVTy = cast<VectorType>(II.getType());
1824
1825 // Walk the chain of conversions.
1826 while (Cursor) {
1827 // If the type of the cursor has fewer lanes than the final result, zeroing
1828 // must take place, which breaks the equivalence chain.
1829 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1830 if (CursorVTy->getElementCount().getKnownMinValue() <
1831 IVTy->getElementCount().getKnownMinValue())
1832 break;
1833
1834 // If the cursor has the same type as I, it is a viable replacement.
1835 if (Cursor->getType() == IVTy)
1836 EarliestReplacement = Cursor;
1837
1838 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1839
1840 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1841 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1842 Intrinsic::aarch64_sve_convert_to_svbool ||
1843 IntrinsicCursor->getIntrinsicID() ==
1844 Intrinsic::aarch64_sve_convert_from_svbool))
1845 break;
1846
1847 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1848 Cursor = IntrinsicCursor->getOperand(0);
1849 }
1850
1851 // If no viable replacement in the conversion chain was found, there is
1852 // nothing to do.
1853 if (!EarliestReplacement)
1854 return std::nullopt;
1855
1856 return IC.replaceInstUsesWith(II, EarliestReplacement);
1857}
1858
1859static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1860 IntrinsicInst &II) {
1861 // svsel(ptrue, x, y) => x
1862 auto *OpPredicate = II.getOperand(0);
1863 if (isAllActivePredicate(OpPredicate))
1864 return IC.replaceInstUsesWith(II, II.getOperand(1));
1865
1866 auto Select =
1867 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1868 return IC.replaceInstUsesWith(II, Select);
1869}
1870
1871static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1872 IntrinsicInst &II) {
1873 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1874 if (!Pg)
1875 return std::nullopt;
1876
1877 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1878 return std::nullopt;
1879
1880 const auto PTruePattern =
1881 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1882 if (PTruePattern != AArch64SVEPredPattern::vl1)
1883 return std::nullopt;
1884
1885 // The intrinsic is inserting into lane zero so use an insert instead.
1886 auto *IdxTy = Type::getInt64Ty(II.getContext());
1887 auto *Insert = InsertElementInst::Create(
1888 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1889 Insert->insertBefore(II.getIterator());
1890 Insert->takeName(&II);
1891
1892 return IC.replaceInstUsesWith(II, Insert);
1893}
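// Illustrative example (not in the original source): sve.dup(%v, ptrue(vl1),
// %x) writes only lane 0, so it is rewritten as insertelement %v, %x, i64 0.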
1894
1895static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1896 IntrinsicInst &II) {
1897 // Replace DupX with a regular IR splat.
1898 auto *RetTy = cast<ScalableVectorType>(II.getType());
1899 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1900 II.getArgOperand(0));
1901 Splat->takeName(&II);
1902 return IC.replaceInstUsesWith(II, Splat);
1903}
1904
1905static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1906 IntrinsicInst &II) {
1907 LLVMContext &Ctx = II.getContext();
1908
1909 if (!isAllActivePredicate(II.getArgOperand(0)))
1910 return std::nullopt;
1911
1912 // Check that we have a compare of zero..
1913 auto *SplatValue =
1914 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1915 if (!SplatValue || !SplatValue->isZero())
1916 return std::nullopt;
1917
1918 // ..against a dupq
1919 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1920 if (!DupQLane ||
1921 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1922 return std::nullopt;
1923
1924 // Where the dupq is a lane 0 replicate of a vector insert
1925 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1926 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1927 return std::nullopt;
1928
1929 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1930 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1931 return std::nullopt;
1932
1933 // Where the vector insert is a fixed constant vector insert into undef at
1934 // index zero
1935 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1936 return std::nullopt;
1937
1938 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1939 return std::nullopt;
1940
1941 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1942 if (!ConstVec)
1943 return std::nullopt;
1944
1945 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1946 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1947 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1948 return std::nullopt;
1949
1950 unsigned NumElts = VecTy->getNumElements();
1951 unsigned PredicateBits = 0;
1952
1953 // Expand intrinsic operands to a 16-bit byte level predicate
1954 for (unsigned I = 0; I < NumElts; ++I) {
1955 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1956 if (!Arg)
1957 return std::nullopt;
1958 if (!Arg->isZero())
1959 PredicateBits |= 1 << (I * (16 / NumElts));
1960 }
1961
1962 // If all bits are zero bail early with an empty predicate
1963 if (PredicateBits == 0) {
1964 auto *PFalse = Constant::getNullValue(II.getType());
1965 PFalse->takeName(&II);
1966 return IC.replaceInstUsesWith(II, PFalse);
1967 }
1968
1969 // Calculate largest predicate type used (where byte predicate is largest)
1970 unsigned Mask = 8;
1971 for (unsigned I = 0; I < 16; ++I)
1972 if ((PredicateBits & (1 << I)) != 0)
1973 Mask |= (I % 8);
1974
1975 unsigned PredSize = Mask & -Mask;
1976 auto *PredType = ScalableVectorType::get(
1977 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1978
1979 // Ensure all relevant bits are set
1980 for (unsigned I = 0; I < 16; I += PredSize)
1981 if ((PredicateBits & (1 << I)) == 0)
1982 return std::nullopt;
1983
1984 auto *PTruePat =
1985 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1986 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1987 {PredType}, {PTruePat});
1988 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1989 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1990 auto *ConvertFromSVBool =
1991 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1992 {II.getType()}, {ConvertToSVBool});
1993
1994 ConvertFromSVBool->takeName(&II);
1995 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1996}
1997
1998static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1999 IntrinsicInst &II) {
2000 Value *Pg = II.getArgOperand(0);
2001 Value *Vec = II.getArgOperand(1);
2002 auto IntrinsicID = II.getIntrinsicID();
2003 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2004
2005 // lastX(splat(X)) --> X
2006 if (auto *SplatVal = getSplatValue(Vec))
2007 return IC.replaceInstUsesWith(II, SplatVal);
2008
2009 // If x and/or y is a splat value then:
2010 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2011 Value *LHS, *RHS;
2012 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2013 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2014 auto *OldBinOp = cast<BinaryOperator>(Vec);
2015 auto OpC = OldBinOp->getOpcode();
2016 auto *NewLHS =
2017 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2018 auto *NewRHS =
2019 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2020 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2021 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2022 return IC.replaceInstUsesWith(II, NewBinOp);
2023 }
2024 }
2025
2026 auto *C = dyn_cast<Constant>(Pg);
2027 if (IsAfter && C && C->isNullValue()) {
2028 // The intrinsic is extracting lane 0 so use an extract instead.
2029 auto *IdxTy = Type::getInt64Ty(II.getContext());
2030 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2031 Extract->insertBefore(II.getIterator());
2032 Extract->takeName(&II);
2033 return IC.replaceInstUsesWith(II, Extract);
2034 }
2035
2036 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2037 if (!IntrPG)
2038 return std::nullopt;
2039
2040 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2041 return std::nullopt;
2042
2043 const auto PTruePattern =
2044 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2045
2046 // Can the intrinsic's predicate be converted to a known constant index?
2047 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2048 if (!MinNumElts)
2049 return std::nullopt;
2050
2051 unsigned Idx = MinNumElts - 1;
2052 // Increment the index if extracting the element after the last active
2053 // predicate element.
2054 if (IsAfter)
2055 ++Idx;
2056
2057 // Ignore extracts whose index is larger than the known minimum vector
2058 // length. NOTE: This is an artificial constraint where we prefer to
2059 // maintain what the user asked for until an alternative is proven faster.
2060 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2061 if (Idx >= PgVTy->getMinNumElements())
2062 return std::nullopt;
2063
2064 // The intrinsic is extracting a fixed lane so use an extract instead.
2065 auto *IdxTy = Type::getInt64Ty(II.getContext());
2066 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2067 Extract->insertBefore(II.getIterator());
2068 Extract->takeName(&II);
2069 return IC.replaceInstUsesWith(II, Extract);
2070}
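// Illustrative example (not in the original source): lastb(ptrue(vl4), %v)
// reads the last active lane, so it becomes extractelement %v, i64 3; lasta
// of the same predicate uses index 4 instead, provided that lane is within
// the known minimum vector length.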
2071
2072static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2073 IntrinsicInst &II) {
2074 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2075 // integer variant across a variety of micro-architectures. Replace scalar
2076 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2077 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2078 // depending on the micro-architecture, but has been observed as generally
2079 // being faster, particularly when the CLAST[AB] op is a loop-carried
2080 // dependency.
2081 Value *Pg = II.getArgOperand(0);
2082 Value *Fallback = II.getArgOperand(1);
2083 Value *Vec = II.getArgOperand(2);
2084 Type *Ty = II.getType();
2085
2086 if (!Ty->isIntegerTy())
2087 return std::nullopt;
2088
2089 Type *FPTy;
2090 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2091 default:
2092 return std::nullopt;
2093 case 16:
2094 FPTy = IC.Builder.getHalfTy();
2095 break;
2096 case 32:
2097 FPTy = IC.Builder.getFloatTy();
2098 break;
2099 case 64:
2100 FPTy = IC.Builder.getDoubleTy();
2101 break;
2102 }
2103
2104 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2105 auto *FPVTy = VectorType::get(
2106 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2107 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2108 auto *FPII = IC.Builder.CreateIntrinsic(
2109 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2110 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2111 return IC.replaceInstUsesWith(II, FPIItoInt);
2112}
2113
2114static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2115 IntrinsicInst &II) {
2116 LLVMContext &Ctx = II.getContext();
2117 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2118 // can work with RDFFR_PP for ptest elimination.
2119 auto *AllPat =
2120 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2121 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2122 {II.getType()}, {AllPat});
2123 auto *RDFFR =
2124 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2125 RDFFR->takeName(&II);
2126 return IC.replaceInstUsesWith(II, RDFFR);
2127}
2128
2129static std::optional<Instruction *>
2130instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2131 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2132
2133 if (Pattern == AArch64SVEPredPattern::all) {
2134 Value *Cnt = IC.Builder.CreateElementCount(
2135 II.getType(), ElementCount::getScalable(NumElts));
2136 Cnt->takeName(&II);
2137 return IC.replaceInstUsesWith(II, Cnt);
2138 }
2139
2140 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2141
2142 return MinNumElts && NumElts >= MinNumElts
2143 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2144 II, ConstantInt::get(II.getType(), MinNumElts)))
2145 : std::nullopt;
2146}
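// Illustrative examples (not in the original source): cntw(SV_ALL) becomes
// vscale * 4, while cntw(SV_VL2) folds to the constant 2 because every legal
// SVE vector has at least four 32-bit lanes.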
2147
2148static std::optional<Instruction *>
2149instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2150 const AArch64Subtarget *ST) {
2151 if (!ST->isStreaming())
2152 return std::nullopt;
2153
2154 // In streaming-mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2155 // with SVEPredPattern::all
2156 Value *Cnt =
2157 IC.Builder.CreateElementCount(II.getType(), ElementCount::getScalable(2));
2159 return IC.replaceInstUsesWith(II, Cnt);
2160}
2161
2162static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2163 IntrinsicInst &II) {
2164 Value *PgVal = II.getArgOperand(0);
2165 Value *OpVal = II.getArgOperand(1);
2166
2167 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2168 // Later optimizations prefer this form.
2169 if (PgVal == OpVal &&
2170 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2171 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2172 Value *Ops[] = {PgVal, OpVal};
2173 Type *Tys[] = {PgVal->getType()};
2174
2175 auto *PTest =
2176 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2177 PTest->takeName(&II);
2178
2179 return IC.replaceInstUsesWith(II, PTest);
2180 }
2181
2182 auto *Pg = dyn_cast<IntrinsicInst>(PgVal);
2183 auto *Op = dyn_cast<IntrinsicInst>(OpVal);
2184
2185 if (!Pg || !Op)
2186 return std::nullopt;
2187
2188 Intrinsic::ID OpIID = Op->getIntrinsicID();
2189
2190 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2191 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2192 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2193 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2194 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2195
2196 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2197
2198 PTest->takeName(&II);
2199 return IC.replaceInstUsesWith(II, PTest);
2200 }
2201
2202 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2203 // Later optimizations may rewrite sequence to use the flag-setting variant
2204 // of instruction X to remove PTEST.
2205 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2206 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2207 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2208 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2209 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2210 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2211 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2212 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2213 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2214 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2215 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2216 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2217 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2218 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2219 Type *Tys[] = {Pg->getType()};
2220
2221 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2222 PTest->takeName(&II);
2223
2224 return IC.replaceInstUsesWith(II, PTest);
2225 }
2226
2227 return std::nullopt;
2228}
2229
2230template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2231static std::optional<Instruction *>
2232instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2233 bool MergeIntoAddendOp) {
2234 Value *P = II.getOperand(0);
2235 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2236 if (MergeIntoAddendOp) {
2237 AddendOp = II.getOperand(1);
2238 Mul = II.getOperand(2);
2239 } else {
2240 AddendOp = II.getOperand(2);
2241 Mul = II.getOperand(1);
2242 }
2243
2244 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2245 m_Value(MulOp1))))
2246 return std::nullopt;
2247
2248 if (!Mul->hasOneUse())
2249 return std::nullopt;
2250
2251 Instruction *FMFSource = nullptr;
2252 if (II.getType()->isFPOrFPVectorTy()) {
2253 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2254 // Stop the combine when the flags on the inputs differ in case dropping
2255 // flags would lead to us missing out on more beneficial optimizations.
2256 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2257 return std::nullopt;
2258 if (!FAddFlags.allowContract())
2259 return std::nullopt;
2260 FMFSource = &II;
2261 }
2262
2263 CallInst *Res;
2264 if (MergeIntoAddendOp)
2265 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2266 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2267 else
2268 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2269 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2270
2271 return IC.replaceInstUsesWith(II, Res);
2272}
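// Illustrative example (not in the original source): with MergeIntoAddendOp
// set, fadd(pg, %acc, fmul(pg, %a, %b)) is fused into fmla(pg, %acc, %a, %b)
// when the fmul has a single use and both calls carry matching fast-math
// flags that allow contraction; with it clear, fadd(pg, fmul(pg, %a, %b), %c)
// becomes fmad(pg, %a, %b, %c).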
2273
2274static std::optional<Instruction *>
2275instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2276 Value *Pred = II.getOperand(0);
2277 Value *PtrOp = II.getOperand(1);
2278 Type *VecTy = II.getType();
2279
2280 if (isAllActivePredicate(Pred)) {
2281 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2282 Load->copyMetadata(II);
2283 return IC.replaceInstUsesWith(II, Load);
2284 }
2285
2286 CallInst *MaskedLoad =
2287 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2288 Pred, ConstantAggregateZero::get(VecTy));
2289 MaskedLoad->copyMetadata(II);
2290 return IC.replaceInstUsesWith(II, MaskedLoad);
2291}
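// Illustrative example (not in the original source): sve.ld1(ptrue-all, %ptr)
// becomes an ordinary vector load, while sve.ld1(%pg, %ptr) becomes an
// llvm.masked.load with a zeroinitializer passthru.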
2292
2293static std::optional<Instruction *>
2294instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2295 Value *VecOp = II.getOperand(0);
2296 Value *Pred = II.getOperand(1);
2297 Value *PtrOp = II.getOperand(2);
2298
2299 if (isAllActivePredicate(Pred)) {
2300 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2301 Store->copyMetadata(II);
2302 return IC.eraseInstFromFunction(II);
2303 }
2304
2305 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2306 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2307 MaskedStore->copyMetadata(II);
2308 return IC.eraseInstFromFunction(II);
2309}
2310
2311static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2312 switch (Intrinsic) {
2313 case Intrinsic::aarch64_sve_fmul_u:
2314 return Instruction::BinaryOps::FMul;
2315 case Intrinsic::aarch64_sve_fadd_u:
2316 return Instruction::BinaryOps::FAdd;
2317 case Intrinsic::aarch64_sve_fsub_u:
2318 return Instruction::BinaryOps::FSub;
2319 default:
2320 return Instruction::BinaryOpsEnd;
2321 }
2322}
2323
2324static std::optional<Instruction *>
2325instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2326 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2327 if (II.isStrictFP())
2328 return std::nullopt;
2329
2330 auto *OpPredicate = II.getOperand(0);
2331 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2332 if (BinOpCode == Instruction::BinaryOpsEnd ||
2333 !isAllActivePredicate(OpPredicate))
2334 return std::nullopt;
2335 auto BinOp = IC.Builder.CreateBinOpFMF(
2336 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2337 return IC.replaceInstUsesWith(II, BinOp);
2338}
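// Illustrative example (not in the original source): fmul_u(ptrue-all, %a, %b)
// becomes a plain IR fmul carrying the call's fast-math flags.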
2339
2340static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2341 IntrinsicInst &II) {
2342 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2343 Intrinsic::aarch64_sve_mla>(
2344 IC, II, true))
2345 return MLA;
2346 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2347 Intrinsic::aarch64_sve_mad>(
2348 IC, II, false))
2349 return MAD;
2350 return std::nullopt;
2351}
2352
2353static std::optional<Instruction *>
2354instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2355 if (auto FMLA =
2356 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2357 Intrinsic::aarch64_sve_fmla>(IC, II,
2358 true))
2359 return FMLA;
2360 if (auto FMAD =
2361 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2362 Intrinsic::aarch64_sve_fmad>(IC, II,
2363 false))
2364 return FMAD;
2365 if (auto FMLA =
2366 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2367 Intrinsic::aarch64_sve_fmla>(IC, II,
2368 true))
2369 return FMLA;
2370 return std::nullopt;
2371}
2372
2373static std::optional<Instruction *>
2374instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2375 if (auto FMLA =
2376 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2377 Intrinsic::aarch64_sve_fmla>(IC, II,
2378 true))
2379 return FMLA;
2380 if (auto FMAD =
2381 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2382 Intrinsic::aarch64_sve_fmad>(IC, II,
2383 false))
2384 return FMAD;
2385 if (auto FMLA_U =
2386 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2387 Intrinsic::aarch64_sve_fmla_u>(
2388 IC, II, true))
2389 return FMLA_U;
2390 return instCombineSVEVectorBinOp(IC, II);
2391}
2392
2393static std::optional<Instruction *>
2394instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2395 if (auto FMLS =
2396 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2397 Intrinsic::aarch64_sve_fmls>(IC, II,
2398 true))
2399 return FMLS;
2400 if (auto FMSB =
2401 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2402 Intrinsic::aarch64_sve_fnmsb>(
2403 IC, II, false))
2404 return FMSB;
2405 if (auto FMLS =
2406 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2407 Intrinsic::aarch64_sve_fmls>(IC, II,
2408 true))
2409 return FMLS;
2410 return std::nullopt;
2411}
2412
2413static std::optional<Instruction *>
2414instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2415 if (auto FMLS =
2416 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2417 Intrinsic::aarch64_sve_fmls>(IC, II,
2418 true))
2419 return FMLS;
2420 if (auto FMSB =
2421 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2422 Intrinsic::aarch64_sve_fnmsb>(
2423 IC, II, false))
2424 return FMSB;
2425 if (auto FMLS_U =
2426 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2427 Intrinsic::aarch64_sve_fmls_u>(
2428 IC, II, true))
2429 return FMLS_U;
2430 return instCombineSVEVectorBinOp(IC, II);
2431}
2432
2433static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2434 IntrinsicInst &II) {
2435 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2436 Intrinsic::aarch64_sve_mls>(
2437 IC, II, true))
2438 return MLS;
2439 return std::nullopt;
2440}
2441
2442static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2443 IntrinsicInst &II) {
2444 Value *UnpackArg = II.getArgOperand(0);
2445 auto *RetTy = cast<ScalableVectorType>(II.getType());
2446 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2447 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2448
2449 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2450 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2451 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2452 ScalarArg =
2453 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2454 Value *NewVal =
2455 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2456 NewVal->takeName(&II);
2457 return IC.replaceInstUsesWith(II, NewVal);
2458 }
2459
2460 return std::nullopt;
2461}
2462static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2463 IntrinsicInst &II) {
2464 auto *OpVal = II.getOperand(0);
2465 auto *OpIndices = II.getOperand(1);
2466 VectorType *VTy = cast<VectorType>(II.getType());
2467
2468 // Check whether OpIndices is a constant splat value < minimal element count
2469 // of result.
2470 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2471 if (!SplatValue ||
2472 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2473 return std::nullopt;
2474
2475 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2476 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2477 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2478 auto *VectorSplat =
2479 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2480
2481 VectorSplat->takeName(&II);
2482 return IC.replaceInstUsesWith(II, VectorSplat);
2483}
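// Illustrative example (not in the original source): sve.tbl(%v, splat(2))
// reads lane 2 of %v into every result lane, so it becomes a splat of
// extractelement %v, 2, provided 2 is below the known minimum lane count.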
2484
2485static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2486 IntrinsicInst &II) {
2487 Value *A, *B;
2488 Type *RetTy = II.getType();
2489 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2490 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2491
2492 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2493 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2494 if ((match(II.getArgOperand(0),
2495 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
2496 match(II.getArgOperand(1),
2497 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
2498 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2499 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2500 auto *TyA = cast<ScalableVectorType>(A->getType());
2501 if (TyA == B->getType() &&
2502 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
2503 auto *SubVec = IC.Builder.CreateInsertVector(
2504 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2505 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2506 TyA->getMinNumElements());
2507 ConcatVec->takeName(&II);
2508 return IC.replaceInstUsesWith(II, ConcatVec);
2509 }
2510 }
2511
2512 return std::nullopt;
2513}
2514
2515static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2516 IntrinsicInst &II) {
2517 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2518 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2519 Value *A, *B;
2520 if (match(II.getArgOperand(0),
2521 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2522 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2523 m_Specific(A), m_Specific(B))))
2524 return IC.replaceInstUsesWith(
2525 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2526
2527 return std::nullopt;
2528}
2529
2530static std::optional<Instruction *>
2531instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2532 Value *Mask = II.getOperand(0);
2533 Value *BasePtr = II.getOperand(1);
2534 Value *Index = II.getOperand(2);
2535 Type *Ty = II.getType();
2536 Value *PassThru = ConstantAggregateZero::get(Ty);
2537
2538 // Contiguous gather => masked load.
2539 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2540 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2541 Value *IndexBase;
2542 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2543 m_Value(IndexBase), m_SpecificInt(1)))) {
2544 Align Alignment =
2545 BasePtr->getPointerAlignment(II.getDataLayout());
2546
2547 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2548 BasePtr, IndexBase);
2549 CallInst *MaskedLoad =
2550 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2551 MaskedLoad->takeName(&II);
2552 return IC.replaceInstUsesWith(II, MaskedLoad);
2553 }
2554
2555 return std::nullopt;
2556}
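// Illustrative example (not in the original source): a gather whose index
// vector is sve.index(%start, 1), i.e. %start, %start+1, %start+2, ..., is a
// contiguous access and becomes a masked load from gep(BasePtr, %start).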
2557
2558static std::optional<Instruction *>
2559instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2560 Value *Val = II.getOperand(0);
2561 Value *Mask = II.getOperand(1);
2562 Value *BasePtr = II.getOperand(2);
2563 Value *Index = II.getOperand(3);
2564 Type *Ty = Val->getType();
2565
2566 // Contiguous scatter => masked store.
2567 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2568 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2569 Value *IndexBase;
2570 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2571 m_Value(IndexBase), m_SpecificInt(1)))) {
2572 Align Alignment =
2573 BasePtr->getPointerAlignment(II.getDataLayout());
2574
2575 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2576 BasePtr, IndexBase);
2577 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2578
2579 return IC.eraseInstFromFunction(II);
2580 }
2581
2582 return std::nullopt;
2583}
2584
2585static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2586 IntrinsicInst &II) {
2587 Type *Int32Ty = IC.Builder.getInt32Ty();
2588 Value *Pred = II.getOperand(0);
2589 Value *Vec = II.getOperand(1);
2590 Value *DivVec = II.getOperand(2);
2591
2592 Value *SplatValue = getSplatValue(DivVec);
2593 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2594 if (!SplatConstantInt)
2595 return std::nullopt;
2596
2597 APInt Divisor = SplatConstantInt->getValue();
2598 const int64_t DivisorValue = Divisor.getSExtValue();
2599 if (DivisorValue == -1)
2600 return std::nullopt;
2601 if (DivisorValue == 1)
2602 IC.replaceInstUsesWith(II, Vec);
2603
2604 if (Divisor.isPowerOf2()) {
2605 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2606 auto ASRD = IC.Builder.CreateIntrinsic(
2607 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2608 return IC.replaceInstUsesWith(II, ASRD);
2609 }
2610 if (Divisor.isNegatedPowerOf2()) {
2611 Divisor.negate();
2612 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2613 auto ASRD = IC.Builder.CreateIntrinsic(
2614 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2615 auto NEG = IC.Builder.CreateIntrinsic(
2616 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2617 return IC.replaceInstUsesWith(II, NEG);
2618 }
2619
2620 return std::nullopt;
2621}
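// Illustrative examples (not in the original source): sdiv(pg, %v, splat(8))
// becomes asrd(pg, %v, 3), and sdiv(pg, %v, splat(-8)) becomes
// neg(asrd(pg, %v, 3)).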
2622
2623bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2624 size_t VecSize = Vec.size();
2625 if (VecSize == 1)
2626 return true;
2627 if (!isPowerOf2_64(VecSize))
2628 return false;
2629 size_t HalfVecSize = VecSize / 2;
2630
2631 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2632 RHS != Vec.end(); LHS++, RHS++) {
2633 if (*LHS != nullptr && *RHS != nullptr) {
2634 if (*LHS == *RHS)
2635 continue;
2636 else
2637 return false;
2638 }
2639 if (!AllowPoison)
2640 return false;
2641 if (*LHS == nullptr && *RHS != nullptr)
2642 *LHS = *RHS;
2643 }
2644
2645 Vec.resize(HalfVecSize);
2646 SimplifyValuePattern(Vec, AllowPoison);
2647 return true;
2648}
2649
2650// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2651// to dupqlane(f64(C)) where C is A concatenated with B
2652static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2653 IntrinsicInst &II) {
2654 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2655 if (!match(II.getOperand(0),
2656 m_Intrinsic<Intrinsic::vector_insert>(
2657 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2658 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2659 return std::nullopt;
2660 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2661
2662 // Insert the scalars into a container ordered by InsertElement index
2663 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2664 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2665 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2666 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2667 CurrentInsertElt = InsertElt->getOperand(0);
2668 }
2669
2670 bool AllowPoison =
2671 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2672 if (!SimplifyValuePattern(Elts, AllowPoison))
2673 return std::nullopt;
2674
2675 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2676 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2677 for (size_t I = 0; I < Elts.size(); I++) {
2678 if (Elts[I] == nullptr)
2679 continue;
2680 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2681 IC.Builder.getInt64(I));
2682 }
2683 if (InsertEltChain == nullptr)
2684 return std::nullopt;
2685
2686 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2687 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2688 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2689 // be narrowed back to the original type.
2690 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2691 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2692 IIScalableTy->getMinNumElements() /
2693 PatternWidth;
2694
2695 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2696 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2697 auto *WideShuffleMaskTy =
2698 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2699
2700 auto InsertSubvector = IC.Builder.CreateInsertVector(
2701 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2702 uint64_t(0));
2703 auto WideBitcast =
2704 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2705 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2706 auto WideShuffle = IC.Builder.CreateShuffleVector(
2707 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2708 auto NarrowBitcast =
2709 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2710
2711 return IC.replaceInstUsesWith(II, NarrowBitcast);
2712}
2713
2714static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2715 IntrinsicInst &II) {
2716 Value *A = II.getArgOperand(0);
2717 Value *B = II.getArgOperand(1);
2718 if (A == B)
2719 return IC.replaceInstUsesWith(II, A);
2720
2721 return std::nullopt;
2722}
2723
2724static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2725 IntrinsicInst &II) {
2726 Value *Pred = II.getOperand(0);
2727 Value *Vec = II.getOperand(1);
2728 Value *Shift = II.getOperand(2);
2729
2730 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2731 Value *AbsPred, *MergedValue;
2732 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2733 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2734 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2735 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2736
2737 return std::nullopt;
2738
2739 // Transform is valid if any of the following are true:
2740 // * The ABS merge value is an undef or non-negative
2741 // * The ABS predicate is all active
2742 // * The ABS predicate and the SRSHL predicates are the same
2743 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2744 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2745 return std::nullopt;
2746
2747 // Only valid when the shift amount is non-negative, otherwise the rounding
2748 // behaviour of SRSHL cannot be ignored.
2749 if (!match(Shift, m_NonNegative()))
2750 return std::nullopt;
2751
2752 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2753 {II.getType()}, {Pred, Vec, Shift});
2754
2755 return IC.replaceInstUsesWith(II, LSL);
2756}
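// Illustrative example (not in the original source): srshl(pg, abs(undef, pg,
// %v), splat(2)) is rewritten as lsl(pg, abs(undef, pg, %v), splat(2)).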
2757
2758static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2759 IntrinsicInst &II) {
2760 Value *Vec = II.getOperand(0);
2761
2762 if (getSplatValue(Vec) == II.getOperand(1))
2763 return IC.replaceInstUsesWith(II, Vec);
2764
2765 return std::nullopt;
2766}
2767
2768static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2769 IntrinsicInst &II) {
2770 // If this barrier is post-dominated by an identical one, we can remove it.
2771 auto *NI = II.getNextNode();
2772 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2773 auto CanSkipOver = [](Instruction *I) {
2774 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2775 };
2776 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2777 auto *NIBB = NI->getParent();
2778 NI = NI->getNextNode();
2779 if (!NI) {
2780 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2781 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2782 else
2783 break;
2784 }
2785 }
2786 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2787 if (NextII && II.isIdenticalTo(NextII))
2788 return IC.eraseInstFromFunction(II);
2789
2790 return std::nullopt;
2791}
2792
2793static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2794 IntrinsicInst &II) {
2795 return IC.replaceInstUsesWith(
2796 II,
2797 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2798 {II.getType(), II.getOperand(0)->getType()},
2799 {II.getOperand(0), II.getOperand(1)}));
2800}
2801
2802static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2803 IntrinsicInst &II) {
2804 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2805 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2806 return std::nullopt;
2807}
2808
2809static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2810 IntrinsicInst &II,
2811 unsigned NumBits) {
2812 Value *Passthru = II.getOperand(0);
2813 Value *Pg = II.getOperand(1);
2814 Value *Op = II.getOperand(2);
2815
2816 // Convert UXT[BHW] to AND.
2817 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2818 auto *Ty = cast<VectorType>(II.getType());
2819 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2820 auto *Mask = ConstantInt::get(Ty, MaskValue);
2821 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2822 {Pg, Op, Mask});
2823 return IC.replaceInstUsesWith(II, And);
2824 }
2825
2826 return std::nullopt;
2827}
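// Illustrative example (not in the original source): uxtb(undef, pg, %v) on
// i32 elements becomes and_u(pg, %v, splat(0xFF)).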
2828
2829static std::optional<Instruction *>
2830instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
2831 SMEAttrs FnSMEAttrs(*II.getFunction());
2832 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2833 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2834 return IC.replaceInstUsesWith(
2835 II, ConstantInt::getBool(II.getType(), IsStreaming));
2836 return std::nullopt;
2837}
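// Illustrative effect (not in the original source): inside a streaming
// function the call folds to true, inside a plain non-streaming function it
// folds to false, and only streaming-compatible functions keep the runtime
// query.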
2838
2839std::optional<Instruction *>
2840AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2841 IntrinsicInst &II) const {
2842 SVEIntrinsicInfo IInfo = constructSVEIntrinsicInfo(II);
2843 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2844 return I;
2845
2846 Intrinsic::ID IID = II.getIntrinsicID();
2847 switch (IID) {
2848 default:
2849 break;
2850 case Intrinsic::aarch64_dmb:
2851 return instCombineDMB(IC, II);
2852 case Intrinsic::aarch64_neon_fmaxnm:
2853 case Intrinsic::aarch64_neon_fminnm:
2854 return instCombineMaxMinNM(IC, II);
2855 case Intrinsic::aarch64_sve_convert_from_svbool:
2856 return instCombineConvertFromSVBool(IC, II);
2857 case Intrinsic::aarch64_sve_dup:
2858 return instCombineSVEDup(IC, II);
2859 case Intrinsic::aarch64_sve_dup_x:
2860 return instCombineSVEDupX(IC, II);
2861 case Intrinsic::aarch64_sve_cmpne:
2862 case Intrinsic::aarch64_sve_cmpne_wide:
2863 return instCombineSVECmpNE(IC, II);
2864 case Intrinsic::aarch64_sve_rdffr:
2865 return instCombineRDFFR(IC, II);
2866 case Intrinsic::aarch64_sve_lasta:
2867 case Intrinsic::aarch64_sve_lastb:
2868 return instCombineSVELast(IC, II);
2869 case Intrinsic::aarch64_sve_clasta_n:
2870 case Intrinsic::aarch64_sve_clastb_n:
2871 return instCombineSVECondLast(IC, II);
2872 case Intrinsic::aarch64_sve_cntd:
2873 return instCombineSVECntElts(IC, II, 2);
2874 case Intrinsic::aarch64_sve_cntw:
2875 return instCombineSVECntElts(IC, II, 4);
2876 case Intrinsic::aarch64_sve_cnth:
2877 return instCombineSVECntElts(IC, II, 8);
2878 case Intrinsic::aarch64_sve_cntb:
2879 return instCombineSVECntElts(IC, II, 16);
2880 case Intrinsic::aarch64_sme_cntsd:
2881 return instCombineSMECntsd(IC, II, ST);
2882 case Intrinsic::aarch64_sve_ptest_any:
2883 case Intrinsic::aarch64_sve_ptest_first:
2884 case Intrinsic::aarch64_sve_ptest_last:
2885 return instCombineSVEPTest(IC, II);
2886 case Intrinsic::aarch64_sve_fadd:
2887 return instCombineSVEVectorFAdd(IC, II);
2888 case Intrinsic::aarch64_sve_fadd_u:
2889 return instCombineSVEVectorFAddU(IC, II);
2890 case Intrinsic::aarch64_sve_fmul_u:
2891 return instCombineSVEVectorBinOp(IC, II);
2892 case Intrinsic::aarch64_sve_fsub:
2893 return instCombineSVEVectorFSub(IC, II);
2894 case Intrinsic::aarch64_sve_fsub_u:
2895 return instCombineSVEVectorFSubU(IC, II);
2896 case Intrinsic::aarch64_sve_add:
2897 return instCombineSVEVectorAdd(IC, II);
2898 case Intrinsic::aarch64_sve_add_u:
2899 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2900 Intrinsic::aarch64_sve_mla_u>(
2901 IC, II, true);
2902 case Intrinsic::aarch64_sve_sub:
2903 return instCombineSVEVectorSub(IC, II);
2904 case Intrinsic::aarch64_sve_sub_u:
2905 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2906 Intrinsic::aarch64_sve_mls_u>(
2907 IC, II, true);
2908 case Intrinsic::aarch64_sve_tbl:
2909 return instCombineSVETBL(IC, II);
2910 case Intrinsic::aarch64_sve_uunpkhi:
2911 case Intrinsic::aarch64_sve_uunpklo:
2912 case Intrinsic::aarch64_sve_sunpkhi:
2913 case Intrinsic::aarch64_sve_sunpklo:
2914 return instCombineSVEUnpack(IC, II);
2915 case Intrinsic::aarch64_sve_uzp1:
2916 return instCombineSVEUzp1(IC, II);
2917 case Intrinsic::aarch64_sve_zip1:
2918 case Intrinsic::aarch64_sve_zip2:
2919 return instCombineSVEZip(IC, II);
2920 case Intrinsic::aarch64_sve_ld1_gather_index:
2921 return instCombineLD1GatherIndex(IC, II);
2922 case Intrinsic::aarch64_sve_st1_scatter_index:
2923 return instCombineST1ScatterIndex(IC, II);
2924 case Intrinsic::aarch64_sve_ld1:
2925 return instCombineSVELD1(IC, II, DL);
2926 case Intrinsic::aarch64_sve_st1:
2927 return instCombineSVEST1(IC, II, DL);
2928 case Intrinsic::aarch64_sve_sdiv:
2929 return instCombineSVESDIV(IC, II);
2930 case Intrinsic::aarch64_sve_sel:
2931 return instCombineSVESel(IC, II);
2932 case Intrinsic::aarch64_sve_srshl:
2933 return instCombineSVESrshl(IC, II);
2934 case Intrinsic::aarch64_sve_dupq_lane:
2935 return instCombineSVEDupqLane(IC, II);
2936 case Intrinsic::aarch64_sve_insr:
2937 return instCombineSVEInsr(IC, II);
2938 case Intrinsic::aarch64_sve_whilelo:
2939 return instCombineWhilelo(IC, II);
2940 case Intrinsic::aarch64_sve_ptrue:
2941 return instCombinePTrue(IC, II);
2942 case Intrinsic::aarch64_sve_uxtb:
2943 return instCombineSVEUxt(IC, II, 8);
2944 case Intrinsic::aarch64_sve_uxth:
2945 return instCombineSVEUxt(IC, II, 16);
2946 case Intrinsic::aarch64_sve_uxtw:
2947 return instCombineSVEUxt(IC, II, 32);
2948 case Intrinsic::aarch64_sme_in_streaming_mode:
2949 return instCombineInStreamingMode(IC, II);
2950 }
2951
2952 return std::nullopt;
2953}
2954
2955std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2956 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2957 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2958 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2959 SimplifyAndSetOp) const {
2960 switch (II.getIntrinsicID()) {
2961 default:
2962 break;
2963 case Intrinsic::aarch64_neon_fcvtxn:
2964 case Intrinsic::aarch64_neon_rshrn:
2965 case Intrinsic::aarch64_neon_sqrshrn:
2966 case Intrinsic::aarch64_neon_sqrshrun:
2967 case Intrinsic::aarch64_neon_sqshrn:
2968 case Intrinsic::aarch64_neon_sqshrun:
2969 case Intrinsic::aarch64_neon_sqxtn:
2970 case Intrinsic::aarch64_neon_sqxtun:
2971 case Intrinsic::aarch64_neon_uqrshrn:
2972 case Intrinsic::aarch64_neon_uqshrn:
2973 case Intrinsic::aarch64_neon_uqxtn:
2974 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2975 break;
2976 }
2977
2978 return std::nullopt;
2979}
2980
2981bool AArch64TTIImpl::enableScalableVectorization() const {
2982 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2983 EnableScalableAutovecInStreamingMode);
2984}
2985
2986TypeSize
2987AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2988 switch (K) {
2989 case TargetTransformInfo::RGK_Scalar:
2990 return TypeSize::getFixed(64);
2991 case TargetTransformInfo::RGK_FixedWidthVector:
2992 if (ST->useSVEForFixedLengthVectors() &&
2993 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2994 return TypeSize::getFixed(
2995 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2996 else if (ST->isNeonAvailable())
2997 return TypeSize::getFixed(128);
2998 else
2999 return TypeSize::getFixed(0);
3000 case TargetTransformInfo::RGK_ScalableVector:
3001 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3002 EnableScalableAutovecInStreamingMode))
3003 return TypeSize::getScalable(128);
3004 else
3005 return TypeSize::getScalable(0);
3006 }
3007 llvm_unreachable("Unsupported register kind");
3008}
3009
3010bool AArch64TTIImpl::isSingleExtWideningInstruction(
3011 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3012 Type *SrcOverrideTy) const {
3013 // A helper that returns a vector type from the given type. The number of
3014 // elements in type Ty determines the vector width.
3015 auto toVectorTy = [&](Type *ArgTy) {
3016 return VectorType::get(ArgTy->getScalarType(),
3017 cast<VectorType>(DstTy)->getElementCount());
3018 };
3019
3020 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3021 // i32, i64]. SVE doesn't generally have the same set of instructions to
3022 // perform an extend with the add/sub/mul. There are SMULLB style
3023 // instructions, but they operate on top/bottom, requiring some sort of lane
3024 // interleaving to be used with zext/sext.
3025 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3026 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3027 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3028 return false;
3029
3030 Type *SrcTy = SrcOverrideTy;
3031 switch (Opcode) {
3032 case Instruction::Add: // UADDW(2), SADDW(2).
3033 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3034 // The second operand needs to be an extend
3035 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3036 if (!SrcTy)
3037 SrcTy =
3038 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3039 break;
3040 }
3041
3042 if (Opcode == Instruction::Sub)
3043 return false;
3044
3045 // UADDW(2), SADDW(2) can be commuted.
3046 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3047 if (!SrcTy)
3048 SrcTy =
3049 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3050 break;
3051 }
3052 return false;
3053 }
3054 default:
3055 return false;
3056 }
3057
3058 // Legalize the destination type and ensure it can be used in a widening
3059 // operation.
3060 auto DstTyL = getTypeLegalizationCost(DstTy);
3061 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3062 return false;
3063
3064 // Legalize the source type and ensure it can be used in a widening
3065 // operation.
3066 assert(SrcTy && "Expected some SrcTy");
3067 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3068 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3069 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3070 return false;
3071
3072 // Get the total number of vector elements in the legalized types.
3073 InstructionCost NumDstEls =
3074 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3075 InstructionCost NumSrcEls =
3076 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3077
3078 // Return true if the legalized types have the same number of vector elements
3079 // and the destination element type size is twice that of the source type.
3080 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3081}
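// Illustrative example (not in the original source): add(%a, zext <8 x i8> %b
// to <8 x i16>) can be selected as uaddw, so getCastInstrCost below can treat
// the extend feeding the widening operand as free.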
3082
3083Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3084 ArrayRef<const Value *> Args,
3085 Type *SrcOverrideTy) const {
3086 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3087 Opcode != Instruction::Mul)
3088 return nullptr;
3089
3090 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3091 // i32, i64]. SVE doesn't generally have the same set of instructions to
3092 // perform an extend with the add/sub/mul. There are SMULLB style
3093 // instructions, but they operate on top/bottom, requiring some sort of lane
3094 // interleaving to be used with zext/sext.
3095 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3096 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3097 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3098 return nullptr;
3099
3100 auto getScalarSizeWithOverride = [&](const Value *V) {
3101 if (SrcOverrideTy)
3102 return SrcOverrideTy->getScalarSizeInBits();
3103 return cast<Instruction>(V)
3104 ->getOperand(0)
3105 ->getType()
3106 ->getScalarSizeInBits();
3107 };
3108
3109 unsigned MaxEltSize = 0;
3110 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3111 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3112 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3113 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3114 MaxEltSize = std::max(EltSize0, EltSize1);
3115 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3116 isa<SExtInst, ZExtInst>(Args[1])) {
3117 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3118 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3119 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3120 // enough.
3121 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3122 return nullptr;
3123 MaxEltSize = DstEltSize / 2;
3124 } else if (Opcode == Instruction::Mul &&
3125 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3126 // If one of the operands is a Zext and the other has enough zero bits
3127 // to be treated as unsigned, we can still generate a umull, meaning the
3128 // zext is free.
3129 KnownBits Known =
3130 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3131 if (Args[0]->getType()->getScalarSizeInBits() -
3132 Known.Zero.countLeadingOnes() >
3133 DstTy->getScalarSizeInBits() / 2)
3134 return nullptr;
3135
3136 MaxEltSize =
3137 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3138 } else
3139 return nullptr;
3140
3141 if (MaxEltSize * 2 > DstEltSize)
3142 return nullptr;
3143
3144 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3145 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3146 return nullptr;
3147 return ExtTy;
3148}
3149
3150// s/urhadd instructions implement the following pattern, making the
3151// extends free:
3152// %x = add ((zext i8 -> i16), 1)
3153// %y = (zext i8 -> i16)
3154// trunc i16 (lshr (add %x, %y), 1) -> i8
3155//
3156bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3157 Type *Src) const {
3158 // The source should be a legal vector type.
3159 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3160 (Src->isScalableTy() && !ST->hasSVE2()))
3161 return false;
3162
3163 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3164 return false;
3165
3166 // Look for trunc/lshr/add before trying to match the pattern.
3167 const Instruction *Add = ExtUser;
3168 auto *AddUser =
3169 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3170 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3171 Add = AddUser;
3172
3173 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3174 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3175 return false;
3176
3177 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3178 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3179 Src->getScalarSizeInBits() !=
3180 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3181 return false;
3182
3183 // Try to match the whole pattern. Ext could be either the first or second
3184 // m_ZExtOrSExt matched.
3185 Instruction *Ex1, *Ex2;
3186 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3187 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3188 return false;
3189
3190 // Ensure both extends are of the same type
3191 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3192 Ex1->getOpcode() == Ex2->getOpcode())
3193 return true;
3194
3195 return false;
3196}
3197
3198 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3199                                                  Type *Src,
3200                                                  TTI::CastContextHint CCH,
3201                                                  TTI::TargetCostKind CostKind,
3202                                                  const Instruction *I) const {
3203 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3204 assert(ISD && "Invalid opcode");
3205 // If the cast is observable, and it is used by a widening instruction (e.g.,
3206 // uaddl, saddw, etc.), it may be free.
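// Illustrative example (hypothetical): in
//   %e = zext <8 x i8> %a to <8 x i16>
//   %r = add <8 x i16> %e, %b
// the add can be selected as uaddw, so the zext is modelled as free.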
3207 if (I && I->hasOneUser()) {
3208 auto *SingleUser = cast<Instruction>(*I->user_begin());
3209 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3210 if (Type *ExtTy = isBinExtWideningInstruction(
3211 SingleUser->getOpcode(), Dst, Operands,
3212 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3213 // The cost from Src->Src*2 needs to be added if required, the cost from
3214 // Src*2->ExtTy is free.
3215 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3216 Type *DoubleSrcTy =
3217 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3218         return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3219                                 TTI::CastContextHint::None, CostKind);
3220 }
3221
3222 return 0;
3223 }
3224
3225 if (isSingleExtWideningInstruction(
3226 SingleUser->getOpcode(), Dst, Operands,
3227 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3228       // For adds, only count the second operand as free if both operands are
3229       // extends but not the same operation (i.e. both operands are not free in
3230       // add(sext, zext)).
3231 if (SingleUser->getOpcode() == Instruction::Add) {
3232 if (I == SingleUser->getOperand(1) ||
3233 (isa<CastInst>(SingleUser->getOperand(1)) &&
3234 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3235 return 0;
3236 } else {
3237 // Others are free so long as isSingleExtWideningInstruction
3238 // returned true.
3239 return 0;
3240 }
3241 }
3242
3243 // The cast will be free for the s/urhadd instructions
3244 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3245 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3246 return 0;
3247 }
3248
3249 // TODO: Allow non-throughput costs that aren't binary.
3250 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3251     if (CostKind != TTI::TCK_RecipThroughput)
3252       return Cost == 0 ? 0 : 1;
3253 return Cost;
3254 };
3255
3256 EVT SrcTy = TLI->getValueType(DL, Src);
3257 EVT DstTy = TLI->getValueType(DL, Dst);
3258
3259 if (!SrcTy.isSimple() || !DstTy.isSimple())
3260 return AdjustCost(
3261 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3262
3263 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3264 // we use fcvtx under SVE2. Give them invalid costs.
3265 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3266 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3267 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3268     return InstructionCost::getInvalid();
3269
3270 static const TypeConversionCostTblEntry BF16Tbl[] = {
3271 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3272 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3273 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3274 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3275 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3276 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3277 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3278 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3279 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3280 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3281 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3282 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3283 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3284 };
3285
3286 if (ST->hasBF16())
3287 if (const auto *Entry = ConvertCostTableLookup(
3288 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3289 return AdjustCost(Entry->Cost);
3290
3291   // Symbolic constants for the SVE sitofp/uitofp entries in the table below.
3292 // The cost of unpacking twice is artificially increased for now in order
3293 // to avoid regressions against NEON, which will use tbl instructions directly
3294 // instead of multiple layers of [s|u]unpk[lo|hi].
3295 // We use the unpacks in cases where the destination type is illegal and
3296 // requires splitting of the input, even if the input type itself is legal.
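// For instance (illustrative): sitofp <vscale x 8 x i16> -> <vscale x 8 x double>
// needs the input unpacked twice (four nxv2i64 parts) before the converts,
// which the nxv8f64 entries below model as SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST.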
3297 const unsigned int SVE_EXT_COST = 1;
3298 const unsigned int SVE_FCVT_COST = 1;
3299 const unsigned int SVE_UNPACK_ONCE = 4;
3300 const unsigned int SVE_UNPACK_TWICE = 16;
3301
3302 static const TypeConversionCostTblEntry ConversionTbl[] = {
3303 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3304 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3305 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3306 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3307 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3308 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3309 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3310 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3311 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3312 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3313 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3314 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3315 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3316 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3317 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3318 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3319 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3320 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3321 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3322 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3323
3324 // Truncations on nxvmiN
3325 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3326 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3327 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3328 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3329 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3330 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3331 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3332 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3333 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3334 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3335 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3336 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3337 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3338 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3339 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3340 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3341 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3342 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3343 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3344 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3345 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3346 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3347 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3348 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3349 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3350 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3351 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3352 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3353 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3354 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3355 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3356 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3357 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3358
3359 // The number of shll instructions for the extension.
3360 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3361 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3362 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3363 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3364 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3365 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3366 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3367 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3368 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3369 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3370 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3371 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3372 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3373 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3374 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3375 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3376
3377 // FP Ext and trunc
3378 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3379 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3380 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3381 // FP16
3382 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3383 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3384 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3385 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3386 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3387 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3388 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3389 // BF16 (uses shift)
3390 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3391 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3392 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3393 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3394 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3395 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3396 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3397 // FP Ext and trunc
3398 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3399 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3400 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3401 // FP16
3402 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3403 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3404 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3405 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3406 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3407 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3408 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3409       // BF16 (more complex; the +bf16 case is handled above)
3410 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3411 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3412 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3413 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3414 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3415 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3416 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3417 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3418
3419 // LowerVectorINT_TO_FP:
3420 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3421 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3422 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3423 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3424 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3425 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3426
3427 // SVE: to nxv2f16
3428 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3429 SVE_EXT_COST + SVE_FCVT_COST},
3430 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3431 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3432 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3433 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3434 SVE_EXT_COST + SVE_FCVT_COST},
3435 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3436 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3437 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3438
3439 // SVE: to nxv4f16
3440 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3441 SVE_EXT_COST + SVE_FCVT_COST},
3442 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3443 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3444 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3445 SVE_EXT_COST + SVE_FCVT_COST},
3446 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3447 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3448
3449 // SVE: to nxv8f16
3450 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3451 SVE_EXT_COST + SVE_FCVT_COST},
3452 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3453 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3454 SVE_EXT_COST + SVE_FCVT_COST},
3455 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3456
3457 // SVE: to nxv16f16
3458 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3459 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3460 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3461 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3462
3463 // Complex: to v2f32
3464 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3465 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3466 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3467 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3468
3469 // SVE: to nxv2f32
3470 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3471 SVE_EXT_COST + SVE_FCVT_COST},
3472 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3473 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3474 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3475 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3476 SVE_EXT_COST + SVE_FCVT_COST},
3477 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3478 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3479 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3480
3481 // Complex: to v4f32
3482 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3483 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3484 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3485 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3486
3487 // SVE: to nxv4f32
3488 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3489 SVE_EXT_COST + SVE_FCVT_COST},
3490 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3491 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3492 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3493 SVE_EXT_COST + SVE_FCVT_COST},
3494 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3495 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3496
3497 // Complex: to v8f32
3498 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3499 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3500 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3501 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3502
3503 // SVE: to nxv8f32
3504 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3505 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3506 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3507 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3508 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3509 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3510 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3511 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3512
3513 // SVE: to nxv16f32
3514 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3515 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3516 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3517 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3518
3519 // Complex: to v16f32
3520 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3521 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3522
3523 // Complex: to v2f64
3524 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3525 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3526 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3527 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3528 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3529 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3530
3531 // SVE: to nxv2f64
3532 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3533 SVE_EXT_COST + SVE_FCVT_COST},
3534 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3535 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3536 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3537 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3538 SVE_EXT_COST + SVE_FCVT_COST},
3539 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3540 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3541 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3542
3543 // Complex: to v4f64
3544 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3545 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3546
3547 // SVE: to nxv4f64
3548 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3549 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3550 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3551 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3552 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3553 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3554 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3555 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3556 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3557 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3558 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3559 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3560
3561 // SVE: to nxv8f64
3562 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3563 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3564 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3565 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3566 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3567 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3568 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3569 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3570
3571 // LowerVectorFP_TO_INT
3572 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3573 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3574 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3575 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3576 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3577 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3578
3579 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3580 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3581 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3582 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3583 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3584 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3585 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3586
3587 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3588 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3589 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3590 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3591 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3592
3593 // Complex, from nxv2f32.
3594 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3595 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3596 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3597 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3598 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3599 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3600 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3601 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3602
3603 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3604 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3605 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3606 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3607 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3608 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3609 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3610
3611 // Complex, from nxv2f64.
3612 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3613 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3614 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3615 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3616 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3617 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3618 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3619 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3620 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3621 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3622
3623 // Complex, from nxv4f32.
3624 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3625 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3626 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3627 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3628 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3629 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3630 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3631 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3632 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3633 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3634
3635 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3636 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3637 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3638 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3639 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3640
3641 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3642 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3643 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3644 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3645 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3646 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3647 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3648
3649 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3650 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3651 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3652 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3653 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3654
3655 // Complex, from nxv8f16.
3656 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3657 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3658 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3659 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3660 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3661 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3662 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3663 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3664 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3665 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3666
3667 // Complex, from nxv4f16.
3668 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3669 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3670 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3671 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3672 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3673 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3674 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3675 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3676
3677 // Complex, from nxv2f16.
3678 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3679 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3680 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3681 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3682 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3683 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3684 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3685 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3686
3687 // Truncate from nxvmf32 to nxvmf16.
3688 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3689 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3690 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3691
3692 // Truncate from nxvmf32 to nxvmbf16.
3693 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3694 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3695 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3696
3697 // Truncate from nxvmf64 to nxvmf16.
3698 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3699 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3700 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3701
3702 // Truncate from nxvmf64 to nxvmbf16.
3703 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3704 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3705 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3706
3707 // Truncate from nxvmf64 to nxvmf32.
3708 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3709 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3710 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3711
3712 // Extend from nxvmf16 to nxvmf32.
3713 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3714 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3715 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3716
3717 // Extend from nxvmbf16 to nxvmf32.
3718 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3719 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3720 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3721
3722 // Extend from nxvmf16 to nxvmf64.
3723 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3724 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3725 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3726
3727 // Extend from nxvmbf16 to nxvmf64.
3728 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3729 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3730 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3731
3732 // Extend from nxvmf32 to nxvmf64.
3733 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3734 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3735 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3736
3737 // Bitcasts from float to integer
3738 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3739 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3740 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3741
3742 // Bitcasts from integer to float
3743 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3744 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3745 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3746
3747       // Add cost for extending to illegal (too wide) scalable vectors.
3748       // Zero/sign extends are implemented by multiple unpack operations,
3749       // where each operation has a cost of 1.
3750 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3751 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3752 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3753 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3754 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3755 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3756
3757 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3758 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3759 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3760 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3761 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3762 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3763 };
3764
3765   // We have to estimate the cost of a fixed-length operation performed on
3766   // SVE registers as the cost of the equivalent scalable operation scaled by
3767   // the number of SVE registers required to hold the fixed-length type.
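// For example (illustrative, assuming SVE fixed-length lowering is enabled for
// the wider type): a v8i64 -> v8i32 trunc is costed as LT.first copies of the
// equivalent nxv2i64 -> nxv2i32 trunc constructed below.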
3768 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3769 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3770 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3771 ST->useSVEForFixedLengthVectors(WiderTy)) {
3772 std::pair<InstructionCost, MVT> LT =
3773 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3774 unsigned NumElements =
3775 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3776 return AdjustCost(
3777 LT.first *
3778         getCastInstrCost(
3779             Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3780 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3781 CostKind, I));
3782 }
3783
3784 if (const auto *Entry = ConvertCostTableLookup(
3785 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3786 return AdjustCost(Entry->Cost);
3787
3788 static const TypeConversionCostTblEntry FP16Tbl[] = {
3789 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3790 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3791 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3792 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3793 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3794 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3795 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3796 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3797 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3798 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3799 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3800 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3801 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3802 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3803 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3804 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3805 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3806 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3807 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3808 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3809 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3810 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3811 };
3812
3813 if (ST->hasFullFP16())
3814 if (const auto *Entry = ConvertCostTableLookup(
3815 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3816 return AdjustCost(Entry->Cost);
3817
3818 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3819 // double-rounding issues.
3820 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3821 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3823 return AdjustCost(
3825 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3826 CCH, CostKind) +
3828 CostKind) +
3830 CostKind));
3831
3832 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3833       CCH == TTI::CastContextHint::Masked &&
3834       ST->isSVEorStreamingSVEAvailable() &&
3835       TLI->getTypeAction(Src->getContext(), SrcTy) ==
3836           TargetLowering::TypePromoteInteger &&
3837       TLI->getTypeAction(Dst->getContext(), DstTy) ==
3838           TargetLowering::TypeSplitVector) {
3839 // The standard behaviour in the backend for these cases is to split the
3840 // extend up into two parts:
3841 // 1. Perform an extending load or masked load up to the legal type.
3842 // 2. Extend the loaded data to the final type.
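// For example (illustrative): zero-extending a masked load of
// <vscale x 4 x i16> to <vscale x 4 x i64> is costed as the extend up to the
// legal <vscale x 4 x i32> (Part1) plus the <vscale x 4 x i32> ->
// <vscale x 4 x i64> extend (Part2).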
3843 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3844 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3845     InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3846         Opcode, LegalTy, Src, CCH, CostKind, I);
3847     InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3848         Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3849 return Part1 + Part2;
3850 }
3851
3852 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3853 // but we also want to include the TTI::CastContextHint::Masked case too.
3854 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3855       CCH == TTI::CastContextHint::Masked &&
3856       ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3857     CCH = TTI::CastContextHint::Normal;
3858
3859 return AdjustCost(
3860 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3861}
3862
3863 InstructionCost
3864 AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3865                                          VectorType *VecTy, unsigned Index,
3866                                          TTI::TargetCostKind CostKind) const {
3867
3868 // Make sure we were given a valid extend opcode.
3869 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3870 "Invalid opcode");
3871
3872 // We are extending an element we extract from a vector, so the source type
3873 // of the extend is the element type of the vector.
3874 auto *Src = VecTy->getElementType();
3875
3876 // Sign- and zero-extends are for integer types only.
3877 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3878
3879 // Get the cost for the extract. We compute the cost (if any) for the extend
3880 // below.
3881 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3882 CostKind, Index, nullptr, nullptr);
3883
3884 // Legalize the types.
3885 auto VecLT = getTypeLegalizationCost(VecTy);
3886 auto DstVT = TLI->getValueType(DL, Dst);
3887 auto SrcVT = TLI->getValueType(DL, Src);
3888
3889 // If the resulting type is still a vector and the destination type is legal,
3890 // we may get the extension for free. If not, get the default cost for the
3891 // extend.
3892 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3893 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3894 CostKind);
3895
3896 // The destination type should be larger than the element type. If not, get
3897 // the default cost for the extend.
3898 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3899 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3900 CostKind);
3901
3902 switch (Opcode) {
3903 default:
3904 llvm_unreachable("Opcode should be either SExt or ZExt");
3905
3906 // For sign-extends, we only need a smov, which performs the extension
3907 // automatically.
3908 case Instruction::SExt:
3909 return Cost;
3910
3911 // For zero-extends, the extend is performed automatically by a umov unless
3912 // the destination type is i64 and the element type is i8 or i16.
3913 case Instruction::ZExt:
3914 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3915 return Cost;
3916 }
3917
3918 // If we are unable to perform the extend for free, get the default cost.
3919 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3920 CostKind);
3921}
3922
3923 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3924                                                TTI::TargetCostKind CostKind,
3925                                                const Instruction *I) const {
3926   if (CostKind != TTI::TCK_RecipThroughput)
3927 return Opcode == Instruction::PHI ? 0 : 1;
3928 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3929 // Branches are assumed to be predicted.
3930 return 0;
3931}
3932
3933InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3934 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3935 const Instruction *I, Value *Scalar,
3936 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3937 assert(Val->isVectorTy() && "This must be a vector type");
3938
3939 if (Index != -1U) {
3940 // Legalize the type.
3941 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3942
3943 // This type is legalized to a scalar type.
3944 if (!LT.second.isVector())
3945 return 0;
3946
3947 // The type may be split. For fixed-width vectors we can normalize the
3948 // index to the new type.
3949 if (LT.second.isFixedLengthVector()) {
3950 unsigned Width = LT.second.getVectorNumElements();
3951 Index = Index % Width;
3952 }
3953
3954     // The element at index zero is already inside the vector.
3955     // - For an insert-element or extract-element instruction that operates
3956     //   on integers, an explicit FPR -> GPR move is still needed, so it has
3957     //   a non-zero cost.
3958 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
3959 return 0;
3960
3961     // This is recognising an LD1 (single-element structure to one lane of
3962     // one register) instruction. I.e., if this is an `insertelement`
3963     // instruction and its second operand is a load, then we will generate an
3964     // LD1, which is an expensive instruction.
3965 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3966 return CostKind == TTI::TCK_CodeSize
3967                  ? 0
3968                  : ST->getVectorInsertExtractBaseCost() + 1;
3969
3970     // i1 inserts and extracts will include an extra cset or cmp of the vector
3971     // value. Increase the cost by 1 to account for this.
3972 if (Val->getScalarSizeInBits() == 1)
3973 return CostKind == TTI::TCK_CodeSize
3974                  ? 2
3975                  : ST->getVectorInsertExtractBaseCost() + 1;
3976
3977 // FIXME:
3978 // If the extract-element and insert-element instructions could be
3979 // simplified away (e.g., could be combined into users by looking at use-def
3980 // context), they have no cost. This is not done in the first place for
3981 // compile-time considerations.
3982 }
3983
3984 // In case of Neon, if there exists extractelement from lane != 0 such that
3985 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3986 // 2. extractelement result feeds into fmul.
3987 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3988 // equivalent to 0.
3989 // then the extractelement can be merged with fmul in the backend and it
3990 // incurs no cost.
3991 // e.g.
3992 // define double @foo(<2 x double> %a) {
3993 // %1 = extractelement <2 x double> %a, i32 0
3994 // %2 = extractelement <2 x double> %a, i32 1
3995 // %res = fmul double %1, %2
3996 // ret double %res
3997 // }
3998 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3999 auto ExtractCanFuseWithFmul = [&]() {
4000 // We bail out if the extract is from lane 0.
4001 if (Index == 0)
4002 return false;
4003
4004 // Check if the scalar element type of the vector operand of ExtractElement
4005 // instruction is one of the allowed types.
4006 auto IsAllowedScalarTy = [&](const Type *T) {
4007 return T->isFloatTy() || T->isDoubleTy() ||
4008 (T->isHalfTy() && ST->hasFullFP16());
4009 };
4010
4011 // Check if the extractelement user is scalar fmul.
4012 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4013 // Check if the user is scalar fmul.
4014 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4015 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4016 !BO->getType()->isVectorTy();
4017 };
4018
4019 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4020 // certain scalar type and a certain vector register width.
4021 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4022 auto RegWidth =
4023           getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4024               .getFixedValue();
4025 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4026 };
4027
4028 // Check if the type constraints on input vector type and result scalar type
4029 // of extractelement instruction are satisfied.
4030 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4031 return false;
4032
4033 if (Scalar) {
4034 DenseMap<User *, unsigned> UserToExtractIdx;
4035 for (auto *U : Scalar->users()) {
4036 if (!IsUserFMulScalarTy(U))
4037 return false;
4038 // Recording entry for the user is important. Index value is not
4039 // important.
4040 UserToExtractIdx[U];
4041 }
4042 if (UserToExtractIdx.empty())
4043 return false;
4044 for (auto &[S, U, L] : ScalarUserAndIdx) {
4045 for (auto *U : S->users()) {
4046 if (UserToExtractIdx.contains(U)) {
4047 auto *FMul = cast<BinaryOperator>(U);
4048 auto *Op0 = FMul->getOperand(0);
4049 auto *Op1 = FMul->getOperand(1);
4050 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4051 UserToExtractIdx[U] = L;
4052 break;
4053 }
4054 }
4055 }
4056 }
4057 for (auto &[U, L] : UserToExtractIdx) {
4058 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4059 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4060 return false;
4061 }
4062 } else {
4063 const auto *EE = cast<ExtractElementInst>(I);
4064
4065 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4066 if (!IdxOp)
4067 return false;
4068
4069 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4070 if (!IsUserFMulScalarTy(U))
4071 return false;
4072
4073 // Check if the other operand of extractelement is also extractelement
4074 // from lane equivalent to 0.
4075 const auto *BO = cast<BinaryOperator>(U);
4076 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4077 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4078 if (OtherEE) {
4079 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4080 if (!IdxOp)
4081 return false;
4082 return IsExtractLaneEquivalentToZero(
4083 cast<ConstantInt>(OtherEE->getIndexOperand())
4084 ->getValue()
4085 .getZExtValue(),
4086 OtherEE->getType()->getScalarSizeInBits());
4087 }
4088 return true;
4089 });
4090 }
4091 return true;
4092 };
4093
4094 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4095 ExtractCanFuseWithFmul())
4096 return 0;
4097
4098 // All other insert/extracts cost this much.
4099 return CostKind == TTI::TCK_CodeSize ? 1
4100 : ST->getVectorInsertExtractBaseCost();
4101}
4102
4103 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4104                                                    TTI::TargetCostKind CostKind,
4105                                                    unsigned Index,
4106 const Value *Op0,
4107 const Value *Op1) const {
4108 // Treat insert at lane 0 into a poison vector as having zero cost. This
4109 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4110 // single dup) are treated as cheap.
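// Illustrative example (hypothetical): the broadcast idiom
//   %i = insertelement <4 x float> poison, float %s, i64 0
//   %b = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
// lowers to a single dup, so the insert here is given zero cost.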
4111 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4112 isa<PoisonValue>(Op0))
4113 return 0;
4114 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4115}
4116
4117 InstructionCost AArch64TTIImpl::getVectorInstrCost(
4118     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4119 Value *Scalar,
4120 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4121 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4122 ScalarUserAndIdx);
4123}
4124
4125 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4126                                                    Type *Val,
4127                                                    TTI::TargetCostKind CostKind,
4128                                                    unsigned Index) const {
4129 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4130}
4131
4135 unsigned Index) const {
4136 if (isa<FixedVectorType>(Val))
4138 Index);
4139
4140 // This typically requires both while and lastb instructions in order
4141 // to extract the last element. If this is in a loop the while
4142 // instruction can at least be hoisted out, although it will consume a
4143 // predicate register. The cost should be more expensive than the base
4144 // extract cost, which is 2 for most CPUs.
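// A rough sketch of the expected lowering (illustrative only, not exact
// codegen):
//   whilels p0.s, xzr, x8   ; predicate covering lanes up to the index
//   lastb   s0, p0, z0.s    ; take the last active lane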
4145 return CostKind == TTI::TCK_CodeSize
4146 ? 2
4147 : ST->getVectorInsertExtractBaseCost() + 1;
4148}
4149
4150 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4151     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4152 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4153 ArrayRef<Value *> VL) const {
4154   if (isa<ScalableVectorType>(Ty))
4155     return InstructionCost::getInvalid();
4156   if (Ty->getElementType()->isFloatingPointTy())
4157 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4158 CostKind);
4159 unsigned VecInstCost =
4160 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4161 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4162}
4163
4164std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4165     Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4166     TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
4167 std::function<InstructionCost(Type *)> InstCost) const {
4168 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4169 return std::nullopt;
4170 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4171 return std::nullopt;
4172
4173 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4174   InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4175                                           TTI::CastContextHint::None, CostKind);
4176 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4177 Cost *= 2;
4178 Cost += InstCost(PromotedTy);
4179 if (IncludeTrunc)
4180     Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4181                              TTI::CastContextHint::None, CostKind);
4182 return Cost;
4183}
4184
4185 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4186     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4187     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4188     ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4189
4190 // The code-generator is currently not able to handle scalable vectors
4191 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4192 // it. This change will be removed when code-generation for these types is
4193 // sufficiently reliable.
4194 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4195 if (VTy->getElementCount() == ElementCount::getScalable(1))
4196       return InstructionCost::getInvalid();
4197
4198 // TODO: Handle more cost kinds.
4199   if (CostKind != TTI::TCK_RecipThroughput)
4200     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4201 Op2Info, Args, CxtI);
4202
4203 // Legalize the type.
4204 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4205 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4206
4207 // Increase the cost for half and bfloat types if not architecturally
4208 // supported.
4209 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4210 ISD == ISD::FDIV || ISD == ISD::FREM)
4211 if (auto PromotedCost = getFP16BF16PromoteCost(
4212 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4213 [&](Type *PromotedTy) {
4214 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4215 Op1Info, Op2Info);
4216 }))
4217 return *PromotedCost;
4218
4219 // If the operation is a widening instruction (smull or umull) and both
4220 // operands are extends the cost can be cheaper by considering that the
4221 // operation will operate on the narrowest type size possible (double the
4222 // largest input size) and a further extend.
4223 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4224 if (ExtTy != Ty)
4225 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4226            getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4227                             TTI::CastContextHint::None, CostKind);
4228 return LT.first;
4229 }
4230
4231 switch (ISD) {
4232 default:
4233 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4234 Op2Info);
4235 case ISD::SREM:
4236 case ISD::SDIV:
4237 /*
4238 Notes for sdiv/srem specific costs:
4239 1. This only considers the cases where the divisor is constant, uniform and
4240 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4241 result in some form of (ldr + adrp), corresponding to constant vectors, or
4242 scalarization of the division operation.
4243 2. Constant divisors, either negative in whole or partially, don't result in
4244 significantly different codegen as compared to positive constant divisors.
4245 So, we don't consider negative divisors separately.
4246 3. If the codegen is significantly different with SVE, it has been indicated
4247 using comments at appropriate places.
4248
4249 sdiv specific cases:
4250 -----------------------------------------------------------------------
4251 codegen | pow-of-2 | Type
4252 -----------------------------------------------------------------------
4253 add + cmp + csel + asr | Y | i64
4254 add + cmp + csel + asr | Y | i32
4255 -----------------------------------------------------------------------
4256
4257 srem specific cases:
4258 -----------------------------------------------------------------------
4259 codegen | pow-of-2 | Type
4260 -----------------------------------------------------------------------
4261 negs + and + and + csneg | Y | i64
4262 negs + and + and + csneg | Y | i32
4263 -----------------------------------------------------------------------
4264
4265 other sdiv/srem cases:
4266 -------------------------------------------------------------------------
4267 common codegen | + srem | + sdiv | pow-of-2 | Type
4268 -------------------------------------------------------------------------
4269 smulh + asr + add + add | - | - | N | i64
4270 smull + lsr + add + add | - | - | N | i32
4271 usra | and + sub | sshr | Y | <2 x i64>
4272 2 * (scalar code) | - | - | N | <2 x i64>
4273 usra | bic + sub | sshr + neg | Y | <4 x i32>
4274 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4275 + sshr + usra | | | |
4276 -------------------------------------------------------------------------
4277 */
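// For example (illustrative): 'sdiv i32 %x, 8' follows the pow-of-2 row in the
// table above (add + cmp + csel + asr) and is costed below as
// 3 * AddCost + AsrCost.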
4278 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4279 InstructionCost AddCost =
4280 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4281 Op1Info.getNoProps(), Op2Info.getNoProps());
4282 InstructionCost AsrCost =
4283 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4284 Op1Info.getNoProps(), Op2Info.getNoProps());
4285 InstructionCost MulCost =
4286 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4287 Op1Info.getNoProps(), Op2Info.getNoProps());
4288 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4289 // have similar cost.
4290 auto VT = TLI->getValueType(DL, Ty);
4291 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4292 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4293 // Neg can be folded into the asr instruction.
4294 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4295 : (3 * AsrCost + AddCost);
4296 } else {
4297 return MulCost + AsrCost + 2 * AddCost;
4298 }
4299 } else if (VT.isVector()) {
4300 InstructionCost UsraCost = 2 * AsrCost;
4301 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4302 // Division with scalable types corresponds to native 'asrd'
4303 // instruction when SVE is available.
4304 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4305
4306 // One more for the negation in SDIV
4307         InstructionCost Cost =
4308             (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4309 if (Ty->isScalableTy() && ST->hasSVE())
4310 Cost += 2 * AsrCost;
4311 else {
4312 Cost +=
4313 UsraCost +
4314 (ISD == ISD::SDIV
4315 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4316 : 2 * AddCost);
4317 }
4318 return Cost;
4319 } else if (LT.second == MVT::v2i64) {
4320 return VT.getVectorNumElements() *
4321 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4322 Op1Info.getNoProps(),
4323 Op2Info.getNoProps());
4324 } else {
4325 // When SVE is available, we get:
4326 // smulh + lsr + add/sub + asr + add/sub.
4327 if (Ty->isScalableTy() && ST->hasSVE())
4328 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4329 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4330 }
4331 }
4332 }
4333 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4334 LT.second.isFixedLengthVector()) {
4335 // FIXME: When the constant vector is non-uniform, this may result in
4336 // loading the vector from constant pool or in some cases, may also result
4337 // in scalarization. For now, we are approximating this with the
4338 // scalarization cost.
4339 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4340 CostKind, -1, nullptr, nullptr);
4341 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4342 CostKind, -1, nullptr, nullptr);
4343 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4344 return ExtractCost + InsertCost +
4345 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4346 CostKind, Op1Info.getNoProps(),
4347 Op2Info.getNoProps());
4348 }
4349 [[fallthrough]];
4350 case ISD::UDIV:
4351 case ISD::UREM: {
4352 auto VT = TLI->getValueType(DL, Ty);
4353 if (Op2Info.isConstant()) {
4354 // If the operand is a power of 2 we can use the shift or and cost.
4355 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4356 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4357 Op1Info.getNoProps(),
4358 Op2Info.getNoProps());
4359 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4360 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4361 Op1Info.getNoProps(),
4362 Op2Info.getNoProps());
4363
4364 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4365 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4366 // The MULHU will be expanded to UMULL for the types not listed below,
4367 // and will become a pair of UMULL+MULL2 for 128bit vectors.
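// For example (illustrative): 'udiv <4 x i32> %x, splat (i32 7)' has no MULHU,
// so it is costed as 2 * MulCost (umull + umull2) plus the extra shift, two
// adds/subs and the final shift below.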
4368 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4369 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4370 LT.second == MVT::nxv16i8;
4371 bool Is128bit = LT.second.is128BitVector();
4372
4373 InstructionCost MulCost =
4374 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4375 Op1Info.getNoProps(), Op2Info.getNoProps());
4376 InstructionCost AddCost =
4377 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4378 Op1Info.getNoProps(), Op2Info.getNoProps());
4379 InstructionCost ShrCost =
4380 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4381 Op1Info.getNoProps(), Op2Info.getNoProps());
4382 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4383 (HasMULH ? 0 : ShrCost) + // UMULL shift
4384 AddCost * 2 + ShrCost;
4385 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4386 }
4387 }
4388
4389 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4390 // emitted by the backend even when those functions are not declared in the
4391 // module.
4392 if (!VT.isVector() && VT.getSizeInBits() > 64)
4393 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4394
4395     InstructionCost Cost = BaseT::getArithmeticInstrCost(
4396         Opcode, Ty, CostKind, Op1Info, Op2Info);
4397 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4398 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4399 // SDIV/UDIV operations are lowered using SVE, then we can have less
4400 // costs.
4401 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4402 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4403 static const CostTblEntry DivTbl[]{
4404 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4405 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4406 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4407 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4408 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4409 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4410
4411 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4412 if (nullptr != Entry)
4413 return Entry->Cost;
4414 }
4415 // For 8/16-bit elements, the cost is higher because the type
4416 // requires promotion and possibly splitting:
4417 if (LT.second.getScalarType() == MVT::i8)
4418 Cost *= 8;
4419 else if (LT.second.getScalarType() == MVT::i16)
4420 Cost *= 4;
4421 return Cost;
4422 } else {
4423 // If one of the operands is a uniform constant then the cost for each
4424 // element is Cost for insertion, extraction and division.
4425 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4426 // operation with scalar type
4427 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4428 (Op2Info.isConstant() && Op2Info.isUniform())) {
4429 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4430           InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4431               Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4432 return (4 + DivCost) * VTy->getNumElements();
4433 }
4434 }
4435 // On AArch64, without SVE, vector divisions are expanded
4436 // into scalar divisions of each pair of elements.
4437 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4438 -1, nullptr, nullptr);
4439 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4440 nullptr, nullptr);
4441 }
4442
4443 // TODO: if one of the arguments is scalar, then it's not necessary to
4444 // double the cost of handling the vector elements.
4445 Cost += Cost;
4446 }
4447 return Cost;
4448 }
4449 case ISD::MUL:
4450 // When SVE is available, then we can lower the v2i64 operation using
4451 // the SVE mul instruction, which has a lower cost.
4452 if (LT.second == MVT::v2i64 && ST->hasSVE())
4453 return LT.first;
4454
4455 // When SVE is not available, there is no MUL.2d instruction,
4456 // which means mul <2 x i64> is expensive as elements are extracted
4457 // from the vectors and the muls scalarized.
4458 // As getScalarizationOverhead is a bit too pessimistic, we
4459 // estimate the cost for a i64 vector directly here, which is:
4460 // - four 2-cost i64 extracts,
4461 // - two 2-cost i64 inserts, and
4462 // - two 1-cost muls.
4463 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
4464 // LT.first = 2 the cost is 28.
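// Worked out (illustrative): 4 extracts * 2 + 2 inserts * 2 + 2 muls * 1
// = 8 + 4 + 2 = 14 for v2i64, doubled to 28 when LT.first = 2.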
4465 if (LT.second != MVT::v2i64)
4466 return LT.first;
4467 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4468 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4469 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4470 nullptr, nullptr) *
4471 2 +
4472 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4473 nullptr, nullptr));
4474 case ISD::ADD:
4475 case ISD::XOR:
4476 case ISD::OR:
4477 case ISD::AND:
4478 case ISD::SRL:
4479 case ISD::SRA:
4480 case ISD::SHL:
4481 // These nodes are marked as 'custom' for combining purposes only.
4482 // We know that they are legal. See LowerAdd in ISelLowering.
4483 return LT.first;
4484
4485 case ISD::FNEG:
4486 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4487 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4488 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4489 CxtI &&
4490 ((CxtI->hasOneUse() &&
4491 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4492 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4493 return 0;
4494 [[fallthrough]];
4495 case ISD::FADD:
4496 case ISD::FSUB:
4497 if (!Ty->getScalarType()->isFP128Ty())
4498 return LT.first;
4499 [[fallthrough]];
4500 case ISD::FMUL:
4501 case ISD::FDIV:
4502 // These nodes are marked as 'custom' just to lower them to SVE.
4503 // We know said lowering will incur no additional cost.
4504 if (!Ty->getScalarType()->isFP128Ty())
4505 return 2 * LT.first;
4506
4507 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4508 Op2Info);
4509 case ISD::FREM:
4510 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4511 // those functions are not declared in the module.
4512 if (!Ty->isVectorTy())
4513 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4514 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4515 Op2Info);
4516 }
4517}
4518
4519 InstructionCost
4520 AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4521                                           const SCEV *Ptr,
4522                                           TTI::TargetCostKind CostKind) const {
4523 // Address computations in vectorized code with non-consecutive addresses will
4524 // likely result in more instructions compared to scalar code where the
4525 // computation can more often be merged into the index mode. The resulting
4526 // extra micro-ops can significantly decrease throughput.
4527 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4528 int MaxMergeDistance = 64;
4529
4530 if (PtrTy->isVectorTy() && SE &&
4531 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4532 return NumVectorInstToHideOverhead;
4533
4534 // In many cases the address computation is not merged into the instruction
4535 // addressing mode.
4536 return 1;
4537}
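// Illustrative example (assuming the default NeonNonConstStrideOverhead of
// 10; the SCEV names below are hypothetical): a vectorized access whose
// pointer stride is unknown, or not a small constant, pays for the extra
// address-generation micro-ops, e.g.
//   getAddressComputationCost(<4 x ptr> type, SE, UnknownStrideSCEV, CostKind)
// returns 10, while a scalar pointer or a small constant stride returns 1.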
4538
4539/// Check whether Opcode1 has lower throughput than Opcode2 according to the
4540/// scheduling model.
4541bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4542 unsigned Opcode1, unsigned Opcode2) const {
4543 const MCSchedModel &Sched = ST->getSchedModel();
4544 const TargetInstrInfo *TII = ST->getInstrInfo();
4545 if (!Sched.hasInstrSchedModel())
4546 return false;
4547
4548 const MCSchedClassDesc *SCD1 =
4549 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4550 const MCSchedClassDesc *SCD2 =
4551 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4552 // We cannot handle variant scheduling classes without an MI. If we need to
4553 // support them for any of the instructions we query, we might need to add a
4554 // way to resolve them without an MI, or stop using the scheduling
4555 // info.
4556 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4557 "Cannot handle variant scheduling classes without an MI");
4558 if (!SCD1->isValid() || !SCD2->isValid())
4559 return false;
4560
4561 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4562 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4563}
4564
4565InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4566 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4567 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4568 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4569 // Some vector selects that are wider than the register width are not
4570 // lowered well. TODO: Improve this with different cost kinds.
4571 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4572 // We would need this many instructions to hide the scalarization happening.
4573 const int AmortizationCost = 20;
4574
4575 // If VecPred is not set, check if we can get a predicate from the context
4576 // instruction, if its type matches the requested ValTy.
4577 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4578 CmpPredicate CurrentPred;
4579 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4580 m_Value())))
4581 VecPred = CurrentPred;
4582 }
4583 // Check if we have a compare/select chain that can be lowered using
4584 // a (F)CMxx & BFI pair.
4585 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4586 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4587 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4588 VecPred == CmpInst::FCMP_UNE) {
4589 static const auto ValidMinMaxTys = {
4590 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4591 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4592 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4593
4594 auto LT = getTypeLegalizationCost(ValTy);
4595 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4596 (ST->hasFullFP16() &&
4597 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4598 return LT.first;
4599 }
4600
4601 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4602 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4603 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4604 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4605 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4606 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4607 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4608 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4609 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4610 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4611 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4612 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4613
4614 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4615 EVT SelValTy = TLI->getValueType(DL, ValTy);
4616 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4617 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4618 SelCondTy.getSimpleVT(),
4619 SelValTy.getSimpleVT()))
4620 return Entry->Cost;
4621 }
4622 }
4623
4624 if (Opcode == Instruction::FCmp) {
4625 if (auto PromotedCost = getFP16BF16PromoteCost(
4626 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4627 [&](Type *PromotedTy) {
4629 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4630 CostKind, Op1Info, Op2Info);
4631 if (isa<VectorType>(PromotedTy))
4633 Instruction::Trunc,
4637 return Cost;
4638 }))
4639 return *PromotedCost;
4640
4641 auto LT = getTypeLegalizationCost(ValTy);
4642 // Model unknown fp compares as a libcall.
4643 if (LT.second.getScalarType() != MVT::f64 &&
4644 LT.second.getScalarType() != MVT::f32 &&
4645 LT.second.getScalarType() != MVT::f16)
4646 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4647 {ValTy, ValTy}, CostKind);
4648
4649 // Some comparison operators require expanding to multiple compares + or.
4650 unsigned Factor = 1;
4651 if (!CondTy->isVectorTy() &&
4652 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4653 Factor = 2; // fcmp with 2 selects
4654 else if (isa<FixedVectorType>(ValTy) &&
4655 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4656 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4657 Factor = 3; // fcmxx+fcmyy+or
4658 else if (isa<ScalableVectorType>(ValTy) &&
4659 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4660 Factor = 3; // fcmxx+fcmyy+or
4661
4662 if (isa<ScalableVectorType>(ValTy) &&
4664 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4665 AArch64::FCMEQv4f32))
4666 Factor *= 2;
4667
4668 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4669 }
4670
4671 // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be folded
4672 // to icmp(and, 0), as free, since we can make use of ands, but only if the
4673 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4674 // provided it will not cause performance regressions.
4675 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4676 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4677 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4678 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4679 if (match(I->getOperand(1), m_Zero()))
4680 return 0;
4681
4682 // x >= 1 / x < 1 -> x > 0 / x <= 0
4683 if (match(I->getOperand(1), m_One()) &&
4684 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4685 return 0;
4686
4687 // x <= -1 / x > -1 -> x > 0 / x <= 0
4688 if (match(I->getOperand(1), m_AllOnes()) &&
4689 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4690 return 0;
4691 }
4692
4693 // The base case handles scalable vectors fine for now, since it treats the
4694 // cost as 1 * legalization cost.
4695 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4696 Op1Info, Op2Info, I);
4697}
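// Illustrative IR for the icmp(and, ...) special case above (hypothetical
// values; assumes a legal i64 type and the throughput cost kind):
//   %a  = and i64 %x, %y
//   %c0 = icmp eq i64 %a, 0    ; folds into ands/tst           -> cost 0
//   %c1 = icmp sge i64 %a, 1   ; same as 'sgt %a, 0'           -> cost 0
//   %c2 = icmp ugt i64 %a, 0   ; unsigned predicate, not freed -> normal cost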
4698
4699TTI::MemCmpExpansionOptions
4700AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4701 TTI::MemCmpExpansionOptions Options;
4702 if (ST->requiresStrictAlign()) {
4703 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4704 // a bunch of instructions when strict align is enabled.
4705 return Options;
4706 }
4707 Options.AllowOverlappingLoads = true;
4708 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4709 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4710 // TODO: Though vector loads usually perform well on AArch64, on some targets
4711 // they may wake up the FP unit, which raises the power consumption. Perhaps
4712 // they could be used with no holds barred (-O3).
4713 Options.LoadSizes = {8, 4, 2, 1};
4714 Options.AllowedTailExpansions = {3, 5, 6};
4715 return Options;
4716}
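// Illustrative note (hypothetical sizes): with LoadSizes = {8, 4, 2, 1} and
// AllowOverlappingLoads set, a 15-byte memcmp can be expanded with two
// overlapping 8-byte loads per operand (at offsets 0 and 7) rather than an
// 8+4+2+1 sequence, and AllowedTailExpansions = {3, 5, 6} lets e.g. a 3-byte
// tail be handled with a pair of smaller loads instead of falling back to a
// library call.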
4717
4718bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4719 return ST->hasSVE();
4720}
4721
4722InstructionCost
4723AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
4724 Align Alignment, unsigned AddressSpace,
4725 TTI::TargetCostKind CostKind) const {
4726 if (useNeonVector(Src))
4727 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4728 CostKind);
4729 auto LT = getTypeLegalizationCost(Src);
4730 if (!LT.first.isValid())
4731 return InstructionCost::getInvalid();
4732
4733 // Return an invalid cost for element types that we are unable to lower.
4734 auto *VT = cast<VectorType>(Src);
4735 if (VT->getElementType()->isIntegerTy(1))
4736 return InstructionCost::getInvalid();
4737
4738 // The code-generator is currently not able to handle scalable vectors
4739 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4740 // it. This change will be removed when code-generation for these types is
4741 // sufficiently reliable.
4742 if (VT->getElementCount() == ElementCount::getScalable(1))
4743 return InstructionCost::getInvalid();
4744
4745 return LT.first;
4746}
4747
4748// This function returns the gather/scatter overhead, either from the
4749// user-provided value or from per-target specialized values in \p ST.
4750static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4751 const AArch64Subtarget *ST) {
4752 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4753 "Should be called on only load or stores.");
4754 switch (Opcode) {
4755 case Instruction::Load:
4756 if (SVEGatherOverhead.getNumOccurrences() > 0)
4757 return SVEGatherOverhead;
4758 return ST->getGatherOverhead();
4759 break;
4760 case Instruction::Store:
4761 if (SVEScatterOverhead.getNumOccurrences() > 0)
4762 return SVEScatterOverhead;
4763 return ST->getScatterOverhead();
4764 break;
4765 default:
4766 llvm_unreachable("Shouldn't have reached here");
4767 }
4768}
4769
4770InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
4771 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4772 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4773 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4774 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4775 Alignment, CostKind, I);
4776 auto *VT = cast<VectorType>(DataTy);
4777 auto LT = getTypeLegalizationCost(DataTy);
4778 if (!LT.first.isValid())
4779 return InstructionCost::getInvalid();
4780
4781 // Return an invalid cost for element types that we are unable to lower.
4782 if (!LT.second.isVector() ||
4783 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4784 VT->getElementType()->isIntegerTy(1))
4785 return InstructionCost::getInvalid();
4786
4787 // The code-generator is currently not able to handle scalable vectors
4788 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4789 // it. This change will be removed when code-generation for these types is
4790 // sufficiently reliable.
4791 if (VT->getElementCount() == ElementCount::getScalable(1))
4792 return InstructionCost::getInvalid();
4793
4794 ElementCount LegalVF = LT.second.getVectorElementCount();
4795 InstructionCost MemOpCost =
4796 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4797 {TTI::OK_AnyValue, TTI::OP_None}, I);
4798 // Add on an overhead cost for using gathers/scatters.
4799 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4800 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4801}
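// Worked example with hypothetical numbers: gathering <vscale x 4 x i32> on a
// target whose maximum vscale is 2 gives LT.first == 1, a scalar i32 load
// cost of 1 and (assuming the default) an SVE gather overhead of 10, so the
// returned cost is roughly 1 * (1 * 10) * (4 * 2) = 80, i.e. it scales with
// the maximum possible number of gathered lanes.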
4802
4803bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4804 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4805}
4806
4807InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4808 Align Alignment,
4809 unsigned AddressSpace,
4810 TTI::TargetCostKind CostKind,
4811 TTI::OperandValueInfo OpInfo,
4812 const Instruction *I) const {
4813 EVT VT = TLI->getValueType(DL, Ty, true);
4814 // Type legalization can't handle structs
4815 if (VT == MVT::Other)
4816 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4817 CostKind);
4818
4819 auto LT = getTypeLegalizationCost(Ty);
4820 if (!LT.first.isValid())
4821 return InstructionCost::getInvalid();
4822
4823 // The code-generator is currently not able to handle scalable vectors
4824 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4825 // it. This change will be removed when code-generation for these types is
4826 // sufficiently reliable.
4827 // We also only support full register predicate loads and stores.
4828 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4829 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4830 (VTy->getElementType()->isIntegerTy(1) &&
4831 !VTy->getElementCount().isKnownMultipleOf(
4832 ElementCount::getScalable(16))))
4833 return InstructionCost::getInvalid();
4834
4835 // TODO: consider latency as well for TCK_SizeAndLatency.
4836 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4837 return LT.first;
4838
4839 if (CostKind != TTI::TCK_RecipThroughput)
4840 return 1;
4841
4842 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4843 LT.second.is128BitVector() && Alignment < Align(16)) {
4844 // Unaligned stores are extremely inefficient. We don't split all
4845 // unaligned 128-bit stores because of the negative impact that splitting
4846 // has shown in practice on inlined block copy code.
4847 // We make such stores expensive so that we will only vectorize if there
4848 // are 6 other instructions getting vectorized.
4849 const int AmortizationCost = 6;
4850
4851 return LT.first * 2 * AmortizationCost;
4852 }
4853
4854 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4855 if (Ty->isPtrOrPtrVectorTy())
4856 return LT.first;
4857
4858 if (useNeonVector(Ty)) {
4859 // Check truncating stores and extending loads.
4860 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4861 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4862 if (VT == MVT::v4i8)
4863 return 2;
4864 // Otherwise we need to scalarize.
4865 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4866 }
4867 EVT EltVT = VT.getVectorElementType();
4868 unsigned EltSize = EltVT.getScalarSizeInBits();
4869 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4870 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4871 return LT.first;
4872 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4873 // widening to v4i8, which produces suboptimal results.
4874 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4875 return LT.first;
4876
4877 // Check non-power-of-2 loads/stores for legal vector element types with
4878 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4879 // operations on smaller power-of-2 ops, including ld1/st1.
4880 LLVMContext &C = Ty->getContext();
4882 SmallVector<EVT> TypeWorklist;
4883 TypeWorklist.push_back(VT);
4884 while (!TypeWorklist.empty()) {
4885 EVT CurrVT = TypeWorklist.pop_back_val();
4886 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4887 if (isPowerOf2_32(CurrNumElements)) {
4888 Cost += 1;
4889 continue;
4890 }
4891
4892 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4893 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4894 TypeWorklist.push_back(
4895 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4896 }
4897 return Cost;
4898 }
4899
4900 return LT.first;
4901}
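// Worked example for the non-power-of-2 NEON splitting above (assuming a
// legal element type and an alignment of 1): a <7 x i16> store is broken into
// <4 x i16> + <3 x i16>, and the <3 x i16> piece into <2 x i16> + <1 x i16>,
// so the returned cost is 3 power-of-2 ld1/st1-style operations.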
4902
4903InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
4904 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4905 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4906 bool UseMaskForCond, bool UseMaskForGaps) const {
4907 assert(Factor >= 2 && "Invalid interleave factor");
4908 auto *VecVTy = cast<VectorType>(VecTy);
4909
4910 if (VecTy->isScalableTy() && !ST->hasSVE())
4911 return InstructionCost::getInvalid();
4912
4913 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4914 // only have lowering for power-of-2 factors.
4915 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4916 // InterleavedAccessPass for ld3/st3
4917 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4918 return InstructionCost::getInvalid();
4919
4920 // Vectorization for masked interleaved accesses is only enabled for scalable
4921 // VF.
4922 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4923 return InstructionCost::getInvalid();
4924
4925 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4926 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4927 auto *SubVecTy =
4928 VectorType::get(VecVTy->getElementType(),
4929 VecVTy->getElementCount().divideCoefficientBy(Factor));
4930
4931 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4932 // Accesses having vector types that are a multiple of 128 bits can be
4933 // matched to more than one ldN/stN instruction.
4934 bool UseScalable;
4935 if (MinElts % Factor == 0 &&
4936 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4937 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4938 }
4939
4940 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4941 Alignment, AddressSpace, CostKind,
4942 UseMaskForCond, UseMaskForGaps);
4943}
4944
4949 for (auto *I : Tys) {
4950 if (!I->isVectorTy())
4951 continue;
4952 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4953 128)
4954 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4955 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4956 }
4957 return Cost;
4958}
4959
4961 return ST->getMaxInterleaveFactor();
4962}
4963
4964// For Falkor, we want to avoid having too many strided loads in a loop since
4965// that can exhaust the HW prefetcher resources. We adjust the unroller
4966// MaxCount preference below to attempt to ensure unrolling doesn't create too
4967// many strided loads.
4968static void
4969getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4970 TargetTransformInfo::UnrollingPreferences &UP) {
4971 enum { MaxStridedLoads = 7 };
4972 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4973 int StridedLoads = 0;
4974 // FIXME? We could make this more precise by looking at the CFG and
4975 // e.g. not counting loads in each side of an if-then-else diamond.
4976 for (const auto BB : L->blocks()) {
4977 for (auto &I : *BB) {
4978 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4979 if (!LMemI)
4980 continue;
4981
4982 Value *PtrValue = LMemI->getPointerOperand();
4983 if (L->isLoopInvariant(PtrValue))
4984 continue;
4985
4986 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4987 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4988 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4989 continue;
4990
4991 // FIXME? We could take pairing of unrolled load copies into account
4992 // by looking at the AddRec, but we would probably have to limit this
4993 // to loops with no stores or other memory optimization barriers.
4994 ++StridedLoads;
4995 // We've seen enough strided loads that seeing more won't make a
4996 // difference.
4997 if (StridedLoads > MaxStridedLoads / 2)
4998 return StridedLoads;
4999 }
5000 }
5001 return StridedLoads;
5002 };
5003
5004 int StridedLoads = countStridedLoads(L, SE);
5005 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5006 << " strided loads\n");
5007 // Pick the largest power of 2 unroll count that won't result in too many
5008 // strided loads.
5009 if (StridedLoads) {
5010 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5011 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5012 << UP.MaxCount << '\n');
5013 }
5014}
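// Worked example: if countStridedLoads finds 3 strided loads, the unroll
// count is capped at 1 << Log2_32(7 / 3) == 2, which keeps the unrolled loop
// within the MaxStridedLoads budget of 7 (2 * 3 = 6 strided loads).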
5015
5016// This function returns true if the loop:
5017// 1. Has a valid cost, and
5018// 2. Has a cost within the supplied budget.
5019// Otherwise it returns false.
5020static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5021 InstructionCost Budget,
5022 unsigned *FinalSize) {
5023 // Estimate the size of the loop.
5024 InstructionCost LoopCost = 0;
5025
5026 for (auto *BB : L->getBlocks()) {
5027 for (auto &I : *BB) {
5028 SmallVector<const Value *, 4> Operands(I.operand_values());
5029 InstructionCost Cost =
5030 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5031 // This can happen with intrinsics that don't currently have a cost model
5032 // or for some operations that require SVE.
5033 if (!Cost.isValid())
5034 return false;
5035
5036 LoopCost += Cost;
5037 if (LoopCost > Budget)
5038 return false;
5039 }
5040 }
5041
5042 if (FinalSize)
5043 *FinalSize = LoopCost.getValue();
5044 return true;
5045}
5046
5047static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5048 const AArch64TTIImpl &TTI) {
5049 // Only consider loops with unknown trip counts for which we can determine
5050 // a symbolic expression. Multi-exit loops with small known trip counts will
5051 // likely be unrolled anyway.
5052 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5053 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
5054 return false;
5055
5056 // It might not be worth unrolling loops with low max trip counts. Restrict
5057 // this to max trip counts > 32 for now.
5058 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5059 if (MaxTC > 0 && MaxTC <= 32)
5060 return false;
5061
5062 // Make sure the loop size is <= 5.
5063 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5064 return false;
5065
5066 // Small search loops with multiple exits can be highly beneficial to unroll.
5067 // We only care about loops with exactly two exiting blocks, although each
5068 // block could jump to the same exit block.
5069 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5070 if (Blocks.size() != 2)
5071 return false;
5072
5073 if (any_of(Blocks, [](BasicBlock *BB) {
5074 return !isa<BranchInst>(BB->getTerminator());
5075 }))
5076 return false;
5077
5078 return true;
5079}
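// Illustrative shape of a loop accepted above: a std::find-style search,
//   while (I != E) { if (*I == X) break; ++I; }
// which, once rotated, has two blocks, each ending in a conditional branch
// (the element compare and the bound check), and a trip count that is only
// known symbolically.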
5080
5081/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5082/// OOO engine's wide instruction window and various predictors.
5083static void
5084getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
5085 TargetTransformInfo::UnrollingPreferences &UP,
5086 const AArch64TTIImpl &TTI) {
5087 // Limit loops with structure that is highly likely to benefit from runtime
5088 // unrolling; that is, we exclude outer loops and loops with many blocks (i.e.
5089 // likely with complex control flow). Note that the heuristics here may be
5090 // overly conservative and we err on the side of avoiding runtime unrolling
5091 // rather than unrolling excessively. They are all subject to further refinement.
5092 if (!L->isInnermost() || L->getNumBlocks() > 8)
5093 return;
5094
5095 // Loops with multiple exits are handled by common code.
5096 if (!L->getExitBlock())
5097 return;
5098
5099 // Check if the loop contains any reductions that could be parallelized when
5100 // unrolling. If so, enable partial unrolling if the trip count is known to be
5101 // a multiple of 2.
5102 bool HasParallelizableReductions =
5103 L->getNumBlocks() == 1 &&
5104 any_of(L->getHeader()->phis(),
5105 [&SE, L](PHINode &Phi) {
5106 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5107 }) &&
5108 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5109 if (HasParallelizableReductions &&
5110 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5111 UP.Partial = true;
5112 UP.MaxCount = 4;
5113 UP.AddAdditionalAccumulators = true;
5114 }
5115
5116 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5117 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
5118 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5119 SE.getSmallConstantMaxTripCount(L) <= 32))
5120 return;
5121
5122 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5123 return;
5124
5126 return;
5127
5128 // Limit to loops with trip counts that are cheap to expand.
5129 UP.SCEVExpansionBudget = 1;
5130
5131 if (HasParallelizableReductions) {
5132 UP.Runtime = true;
5134 UP.AddAdditionalAccumulators = true;
5135 }
5136
5137 // Try to unroll small single-block loops with a low size budget, if they
5138 // have load/store dependencies, to expose more parallel memory access
5139 // streams, or if they do little work inside the block (i.e. a load -> X -> store pattern).
5140 BasicBlock *Header = L->getHeader();
5141 BasicBlock *Latch = L->getLoopLatch();
5142 if (Header == Latch) {
5143 // Estimate the size of the loop.
5144 unsigned Size;
5145 unsigned Width = 10;
5146 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5147 return;
5148
5149 // Try to find an unroll count that maximizes the use of the instruction
5150 // window, i.e. trying to fetch as many instructions per cycle as possible.
5151 unsigned MaxInstsPerLine = 16;
5152 unsigned UC = 1;
5153 unsigned BestUC = 1;
5154 unsigned SizeWithBestUC = BestUC * Size;
5155 while (UC <= 8) {
5156 unsigned SizeWithUC = UC * Size;
5157 if (SizeWithUC > 48)
5158 break;
5159 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5160 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5161 BestUC = UC;
5162 SizeWithBestUC = BestUC * Size;
5163 }
5164 UC++;
5165 }
5166
5167 if (BestUC == 1)
5168 return;
5169
5170 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5171 SmallVector<StoreInst *> Stores;
5172 for (auto *BB : L->blocks()) {
5173 for (auto &I : *BB) {
5174 Value *Ptr = getLoadStorePointerOperand(&I);
5175 if (!Ptr)
5176 continue;
5177 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5178 if (SE.isLoopInvariant(PtrSCEV, L))
5179 continue;
5180 if (isa<LoadInst>(&I)) {
5181 LoadedValuesPlus.insert(&I);
5182 // Include in-loop 1st users of loaded values.
5183 for (auto *U : I.users())
5184 if (L->contains(cast<Instruction>(U)))
5185 LoadedValuesPlus.insert(U);
5186 } else
5187 Stores.push_back(cast<StoreInst>(&I));
5188 }
5189 }
5190
5191 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5192 return LoadedValuesPlus.contains(SI->getOperand(0));
5193 }))
5194 return;
5195
5196 UP.Runtime = true;
5197 UP.DefaultUnrollRuntimeCount = BestUC;
5198 return;
5199 }
5200
5201 // Try to runtime-unroll loops with early-continues depending on loop-varying
5202 // loads; this helps with branch-prediction for the early-continues.
5203 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5205 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5206 !llvm::is_contained(Preds, Header) ||
5207 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5208 return;
5209
5210 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5211 [&](Instruction *I, unsigned Depth) -> bool {
5212 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5213 return false;
5214
5215 if (isa<LoadInst>(I))
5216 return true;
5217
5218 return any_of(I->operands(), [&](Value *V) {
5219 auto *I = dyn_cast<Instruction>(V);
5220 return I && DependsOnLoopLoad(I, Depth + 1);
5221 });
5222 };
5223 CmpPredicate Pred;
5224 Instruction *I;
5225 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5226 m_Value())) &&
5227 DependsOnLoopLoad(I, 0)) {
5228 UP.Runtime = true;
5229 }
5230}
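// Worked example of the unroll-count search above (hypothetical sizes): for a
// single-block loop of 12 instructions the candidate sizes are 12, 24, 36 and
// 48; 48 is a multiple of MaxInstsPerLine (16), so BestUC == 4 is used as the
// runtime unroll count, provided at least one store forwards a value that was
// loaded (or derived from a load) in the same iteration.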
5231
5234 OptimizationRemarkEmitter *ORE) const {
5235 // Enable partial unrolling and runtime unrolling.
5236 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5237
5238 UP.UpperBound = true;
5239
5240 // Inner loops are more likely to be hot, and their runtime checks can be
5241 // hoisted out by the LICM pass, so the overhead is lower; try a larger
5242 // threshold to unroll more loops.
5243 if (L->getLoopDepth() > 1)
5244 UP.PartialThreshold *= 2;
5245
5246 // Disable partial & runtime unrolling on -Os.
5248
5249 // Scan the loop: don't unroll loops with calls as this could prevent
5250 // inlining. Don't unroll auto-vectorized loops either, though do allow
5251 // unrolling of the scalar remainder.
5252 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5253 for (auto *BB : L->getBlocks()) {
5254 for (auto &I : *BB) {
5255 // Both auto-vectorized loops and the scalar remainder have the
5256 // isvectorized attribute, so differentiate between them by the presence
5257 // of vector instructions.
5258 if (IsVectorized && I.getType()->isVectorTy())
5259 return;
5260 if (isa<CallBase>(I)) {
5263 if (!isLoweredToCall(F))
5264 continue;
5265 return;
5266 }
5267 }
5268 }
5269
5270 // Apply subtarget-specific unrolling preferences.
5271 switch (ST->getProcFamily()) {
5272 case AArch64Subtarget::AppleA14:
5273 case AArch64Subtarget::AppleA15:
5274 case AArch64Subtarget::AppleA16:
5275 case AArch64Subtarget::AppleM4:
5276 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5277 break;
5278 case AArch64Subtarget::Falkor:
5279 if (EnableFalkorHWPFUnrollFix)
5280 getFalkorUnrollingPreferences(L, SE, UP);
5281 break;
5282 default:
5283 break;
5284 }
5285
5286 // If this is a small, multi-exit loop similar to something like std::find,
5287 // then there is typically a performance improvement achieved by unrolling.
5288 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5289 UP.RuntimeUnrollMultiExit = true;
5290 UP.Runtime = true;
5291 // Limit unroll count.
5293 // Allow slightly more costly trip-count expansion to catch search loops
5294 // with pointer inductions.
5295 UP.SCEVExpansionBudget = 5;
5296 return;
5297 }
5298
5299 // Enable runtime unrolling for in-order models.
5300 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so by
5301 // checking for that case, we can ensure that the default behaviour is
5302 // unchanged.
5303 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5304 !ST->getSchedModel().isOutOfOrder()) {
5305 UP.Runtime = true;
5306 UP.Partial = true;
5307 UP.UnrollRemainder = true;
5309
5310 UP.UnrollAndJam = true;
5312 }
5313}
5314
5319
5320Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5321 Type *ExpectedType,
5322 bool CanCreate) const {
5323 switch (Inst->getIntrinsicID()) {
5324 default:
5325 return nullptr;
5326 case Intrinsic::aarch64_neon_st2:
5327 case Intrinsic::aarch64_neon_st3:
5328 case Intrinsic::aarch64_neon_st4: {
5329 // Create a struct type
5330 StructType *ST = dyn_cast<StructType>(ExpectedType);
5331 if (!CanCreate || !ST)
5332 return nullptr;
5333 unsigned NumElts = Inst->arg_size() - 1;
5334 if (ST->getNumElements() != NumElts)
5335 return nullptr;
5336 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5337 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5338 return nullptr;
5339 }
5340 Value *Res = PoisonValue::get(ExpectedType);
5341 IRBuilder<> Builder(Inst);
5342 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5343 Value *L = Inst->getArgOperand(i);
5344 Res = Builder.CreateInsertValue(Res, L, i);
5345 }
5346 return Res;
5347 }
5348 case Intrinsic::aarch64_neon_ld2:
5349 case Intrinsic::aarch64_neon_ld3:
5350 case Intrinsic::aarch64_neon_ld4:
5351 if (Inst->getType() == ExpectedType)
5352 return Inst;
5353 return nullptr;
5354 }
5355}
5356
5357bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5358 MemIntrinsicInfo &Info) const {
5359 switch (Inst->getIntrinsicID()) {
5360 default:
5361 break;
5362 case Intrinsic::aarch64_neon_ld2:
5363 case Intrinsic::aarch64_neon_ld3:
5364 case Intrinsic::aarch64_neon_ld4:
5365 Info.ReadMem = true;
5366 Info.WriteMem = false;
5367 Info.PtrVal = Inst->getArgOperand(0);
5368 break;
5369 case Intrinsic::aarch64_neon_st2:
5370 case Intrinsic::aarch64_neon_st3:
5371 case Intrinsic::aarch64_neon_st4:
5372 Info.ReadMem = false;
5373 Info.WriteMem = true;
5374 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5375 break;
5376 }
5377
5378 switch (Inst->getIntrinsicID()) {
5379 default:
5380 return false;
5381 case Intrinsic::aarch64_neon_ld2:
5382 case Intrinsic::aarch64_neon_st2:
5383 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5384 break;
5385 case Intrinsic::aarch64_neon_ld3:
5386 case Intrinsic::aarch64_neon_st3:
5387 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5388 break;
5389 case Intrinsic::aarch64_neon_ld4:
5390 case Intrinsic::aarch64_neon_st4:
5391 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5392 break;
5393 }
5394 return true;
5395}
5396
5397/// See if \p I should be considered for address type promotion. We check if
5398/// \p I is a sext with the right type that is used in memory accesses. If it is
5399/// used in a "complex" getelementptr, we allow it to be promoted without
5400/// finding other sext instructions that sign extended the same initial value.
5401/// A getelementptr is considered "complex" if it has more than 2 operands.
5402bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5403 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5404 bool Considerable = false;
5405 AllowPromotionWithoutCommonHeader = false;
5406 if (!isa<SExtInst>(&I))
5407 return false;
5408 Type *ConsideredSExtType =
5409 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5410 if (I.getType() != ConsideredSExtType)
5411 return false;
5412 // See if the sext is the one with the right type and used in at least one
5413 // GetElementPtrInst.
5414 for (const User *U : I.users()) {
5415 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5416 Considerable = true;
5417 // A getelementptr is considered as "complex" if it has more than 2
5418 // operands. We will promote a SExt used in such complex GEP as we
5419 // expect some computation to be merged if they are done on 64 bits.
5420 if (GEPInst->getNumOperands() > 2) {
5421 AllowPromotionWithoutCommonHeader = true;
5422 break;
5423 }
5424 }
5425 }
5426 return Considerable;
5427}
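// Illustrative IR (hypothetical): the first GEP below has two operands, so
// the sext is only interesting if other sexts of the same value exist, while
// the second GEP has three operands and counts as "complex" on its own:
//   %idx = sext i32 %i to i64
//   %p1 = getelementptr inbounds i32, ptr %base, i64 %idx
//   %p2 = getelementptr inbounds [64 x i32], ptr %base, i64 0, i64 %idx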
5428
5429bool AArch64TTIImpl::isLegalToVectorizeReduction(
5430 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5431 if (!VF.isScalable())
5432 return true;
5433
5434 Type *Ty = RdxDesc.getRecurrenceType();
5435 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5436 return false;
5437
5438 switch (RdxDesc.getRecurrenceKind()) {
5439 case RecurKind::Sub:
5441 case RecurKind::Add:
5442 case RecurKind::FAdd:
5443 case RecurKind::And:
5444 case RecurKind::Or:
5445 case RecurKind::Xor:
5446 case RecurKind::SMin:
5447 case RecurKind::SMax:
5448 case RecurKind::UMin:
5449 case RecurKind::UMax:
5450 case RecurKind::FMin:
5451 case RecurKind::FMax:
5452 case RecurKind::FMulAdd:
5453 case RecurKind::AnyOf:
5454 return true;
5455 default:
5456 return false;
5457 }
5458}
5459
5460InstructionCost
5461AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5462 FastMathFlags FMF,
5463 TTI::TargetCostKind CostKind) const {
5464 // The code-generator is currently not able to handle scalable vectors
5465 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5466 // it. This change will be removed when code-generation for these types is
5467 // sufficiently reliable.
5468 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5469 if (VTy->getElementCount() == ElementCount::getScalable(1))
5470 return InstructionCost::getInvalid();
5471
5472 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5473
5474 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5475 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5476
5477 InstructionCost LegalizationCost = 0;
5478 if (LT.first > 1) {
5479 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5480 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5481 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5482 }
5483
5484 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5485}
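// Worked example (assuming the legal v4i32 smax intrinsic costs 1): an smax
// reduction over <8 x i32> legalizes to two v4i32 halves (LT.first == 2), so
// the cost is 1 to combine the halves plus 2 for the final horizontal
// reduction, i.e. 3.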
5486
5487InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5488 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5489 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5490 InstructionCost LegalizationCost = 0;
5491 if (LT.first > 1) {
5492 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5493 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5494 LegalizationCost *= LT.first - 1;
5495 }
5496
5497 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5498 assert(ISD && "Invalid opcode");
5499 // Add the final reduction cost for the legal horizontal reduction
5500 switch (ISD) {
5501 case ISD::ADD:
5502 case ISD::AND:
5503 case ISD::OR:
5504 case ISD::XOR:
5505 case ISD::FADD:
5506 return LegalizationCost + 2;
5507 default:
5508 return InstructionCost::getInvalid();
5509 }
5510}
5511
5512InstructionCost
5513AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5514 std::optional<FastMathFlags> FMF,
5515 TTI::TargetCostKind CostKind) const {
5516 // The code-generator is currently not able to handle scalable vectors
5517 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5518 // it. This change will be removed when code-generation for these types is
5519 // sufficiently reliable.
5520 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5521 if (VTy->getElementCount() == ElementCount::getScalable(1))
5522 return InstructionCost::getInvalid();
5523
5524 if (TTI::requiresOrderedReduction(FMF)) {
5525 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5526 InstructionCost BaseCost =
5527 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5528 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5529 // end up vectorizing for more computationally intensive loops.
5530 return BaseCost + FixedVTy->getNumElements();
5531 }
5532
5533 if (Opcode != Instruction::FAdd)
5534 return InstructionCost::getInvalid();
5535
5536 auto *VTy = cast<ScalableVectorType>(ValTy);
5537 InstructionCost Cost =
5538 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5539 Cost *= getMaxNumElements(VTy->getElementCount());
5540 return Cost;
5541 }
5542
5543 if (isa<ScalableVectorType>(ValTy))
5544 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5545
5546 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5547 MVT MTy = LT.second;
5548 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5549 assert(ISD && "Invalid opcode");
5550
5551 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5552 // instructions as twice a normal vector add, plus 1 for each legalization
5553 // step (LT.first). This is the only arithmetic vector reduction operation for
5554 // which we have an instruction.
5555 // OR, XOR and AND costs should match the codegen from:
5556 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5557 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5558 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5559 static const CostTblEntry CostTblNoPairwise[]{
5560 {ISD::ADD, MVT::v8i8, 2},
5561 {ISD::ADD, MVT::v16i8, 2},
5562 {ISD::ADD, MVT::v4i16, 2},
5563 {ISD::ADD, MVT::v8i16, 2},
5564 {ISD::ADD, MVT::v2i32, 2},
5565 {ISD::ADD, MVT::v4i32, 2},
5566 {ISD::ADD, MVT::v2i64, 2},
5567 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5568 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5569 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5570 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5571 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5572 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5573 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5574 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5575 {ISD::XOR, MVT::v16i8, 7},
5576 {ISD::XOR, MVT::v4i16, 4},
5577 {ISD::XOR, MVT::v8i16, 6},
5578 {ISD::XOR, MVT::v2i32, 3},
5579 {ISD::XOR, MVT::v4i32, 5},
5580 {ISD::XOR, MVT::v2i64, 3},
5581 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5582 {ISD::AND, MVT::v16i8, 7},
5583 {ISD::AND, MVT::v4i16, 4},
5584 {ISD::AND, MVT::v8i16, 6},
5585 {ISD::AND, MVT::v2i32, 3},
5586 {ISD::AND, MVT::v4i32, 5},
5587 {ISD::AND, MVT::v2i64, 3},
5588 };
5589 switch (ISD) {
5590 default:
5591 break;
5592 case ISD::FADD:
5593 if (Type *EltTy = ValTy->getScalarType();
5594 // FIXME: For half types without fullfp16 support, this could extend and
5595 // use a fp32 faddp reduction but current codegen unrolls.
5596 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5597 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5598 const unsigned NElts = MTy.getVectorNumElements();
5599 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5600 isPowerOf2_32(NElts))
5601 // Reduction corresponding to series of fadd instructions is lowered to
5602 // series of faddp instructions. faddp has latency/throughput that
5603 // matches fadd instruction and hence, every faddp instruction can be
5604 // considered to have a relative cost = 1 with
5605 // CostKind = TCK_RecipThroughput.
5606 // An faddp will pairwise add vector elements, so the size of input
5607 // vector reduces by half every time, requiring
5608 // #(faddp instructions) = log2_32(NElts).
5609 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5610 }
5611 break;
5612 case ISD::ADD:
5613 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5614 return (LT.first - 1) + Entry->Cost;
5615 break;
5616 case ISD::XOR:
5617 case ISD::AND:
5618 case ISD::OR:
5619 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5620 if (!Entry)
5621 break;
5622 auto *ValVTy = cast<FixedVectorType>(ValTy);
5623 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5624 isPowerOf2_32(ValVTy->getNumElements())) {
5625 InstructionCost ExtraCost = 0;
5626 if (LT.first != 1) {
5627 // Type needs to be split, so there is an extra cost of LT.first - 1
5628 // arithmetic ops.
5629 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5630 MTy.getVectorNumElements());
5631 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5632 ExtraCost *= LT.first - 1;
5633 }
5634 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5635 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5636 return Cost + ExtraCost;
5637 }
5638 break;
5639 }
5640 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5641}
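// Worked example for the FADD case above: a fadd reduction of <4 x float> is
// legal (LT.first == 1) with NElts == 4, so the cost is (1 - 1) + log2(4) = 2,
// matching a lowering into two faddp instructions.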
5642
5643InstructionCost AArch64TTIImpl::getExtendedReductionCost(
5644 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5645 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5646 EVT VecVT = TLI->getValueType(DL, VecTy);
5647 EVT ResVT = TLI->getValueType(DL, ResTy);
5648
5649 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5650 VecVT.getSizeInBits() >= 64) {
5651 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5652
5653 // The legal cases are:
5654 // UADDLV 8/16/32->32
5655 // UADDLP 32->64
5656 unsigned RevVTSize = ResVT.getSizeInBits();
5657 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5658 RevVTSize <= 32) ||
5659 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5660 RevVTSize <= 32) ||
5661 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5662 RevVTSize <= 64))
5663 return (LT.first - 1) * 2 + 2;
5664 }
5665
5666 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5667 CostKind);
5668}
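// Worked example: an add reduction of <16 x i8> elements extended to i32
// legalizes to v16i8 (LT.first == 1) with a 32-bit result, which falls into
// the UADDLV bucket above, so the cost is (1 - 1) * 2 + 2 = 2.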
5669
5670InstructionCost
5671AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5672 Type *ResTy, VectorType *VecTy,
5673 TTI::TargetCostKind CostKind) const {
5674 EVT VecVT = TLI->getValueType(DL, VecTy);
5675 EVT ResVT = TLI->getValueType(DL, ResTy);
5676
5677 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5678 RedOpcode == Instruction::Add) {
5679 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5680
5681 // The legal cases with dotprod are
5682 // UDOT 8->32
5683 // Which requires an additional uaddv to sum the i32 values.
5684 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5685 ResVT == MVT::i32)
5686 return LT.first + 2;
5687 }
5688
5689 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5690 CostKind);
5691}
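// Worked example (assuming +dotprod): multiplying two <16 x i8> vectors and
// accumulating into an i32 legalizes to v16i8 (LT.first == 1), so the cost is
// 1 + 2 = 3, roughly a udot/sdot plus the final horizontal add.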
5692
5696 static const CostTblEntry ShuffleTbl[] = {
5697 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5698 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5699 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5700 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5701 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5702 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5703 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5704 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5705 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5706 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5707 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5708 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5709 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5710 };
5711
5712 // The code-generator is currently not able to handle scalable vectors
5713 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5714 // it. This change will be removed when code-generation for these types is
5715 // sufficiently reliable.
5716 if (Tp->getElementCount() == ElementCount::getScalable(1))
5717 return InstructionCost::getInvalid();
5718
5719 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5720 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5721 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5722 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5723 : LT.second;
5724 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5725 InstructionCost LegalizationCost = 0;
5726 if (Index < 0) {
5727 LegalizationCost =
5728 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5730 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5732 }
5733
5734 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5735 // Cost performed on a promoted type.
5736 if (LT.second.getScalarType() == MVT::i1) {
5737 LegalizationCost +=
5738 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5740 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5742 }
5743 const auto *Entry =
5744 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5745 assert(Entry && "Illegal Type for Splice");
5746 LegalizationCost += Entry->Cost;
5747 return LegalizationCost * LT.first;
5748}
5749
5751 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5753 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5756
5758 return Invalid;
5759
5760 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5761 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5762 return Invalid;
5763
5764 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5765 OpAExtend == TTI::PR_None)
5766 return Invalid;
5767
5768 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5769 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5770 "Unexpected values for OpBExtend or InputTypeB");
5771
5772 // We only support multiply binary operations for now, and for muls we
5773 // require the types being extended to be the same.
5774 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
5775 return Invalid;
5776
5777 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5778 if (IsUSDot && !ST->hasMatMulInt8())
5779 return Invalid;
5780
5781 unsigned Ratio =
5782 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5783 if (VF.getKnownMinValue() <= Ratio)
5784 return Invalid;
5785
5786 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5787 VectorType *AccumVectorType =
5788 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5789 // We don't yet support all kinds of legalization.
5790 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5791 EVT::getEVT(AccumVectorType));
5792 switch (TC.first) {
5793 default:
5794 return Invalid;
5798 // The legalised type (e.g. after splitting) must be legal too.
5799 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5801 return Invalid;
5802 break;
5803 }
5804
5805 std::pair<InstructionCost, MVT> AccumLT =
5806 getTypeLegalizationCost(AccumVectorType);
5807 std::pair<InstructionCost, MVT> InputLT =
5808 getTypeLegalizationCost(InputVectorType);
5809
5810 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5811
5812 // Prefer using full types by costing half-full input types as more expensive.
5813 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5815 // FIXME: This can be removed after the cost of the extends are folded into
5816 // the dot-product expression in VPlan, after landing:
5817 // https://github.com/llvm/llvm-project/pull/147302
5818 Cost *= 2;
5819
5820 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5821 // i16 -> i64 is natively supported for udot/sdot
5822 if (AccumLT.second.getScalarType() == MVT::i64 &&
5823 InputLT.second.getScalarType() == MVT::i16)
5824 return Cost;
5825 // i8 -> i64 is supported with an extra level of extends
5826 if (AccumLT.second.getScalarType() == MVT::i64 &&
5827 InputLT.second.getScalarType() == MVT::i8)
5828 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5829 // because it requires two extra extends on the inputs. But if we'd change
5830 // that now, a regular reduction would be cheaper because the costs of
5831 // the extends in the IR are still counted. This can be fixed
5832 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5833 return Cost;
5834 }
5835
5836 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5837 if (ST->isSVEorStreamingSVEAvailable() ||
5838 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5839 ST->hasDotProd())) {
5840 if (AccumLT.second.getScalarType() == MVT::i32 &&
5841 InputLT.second.getScalarType() == MVT::i8)
5842 return Cost;
5843 }
5844
5845 // Add additional cost for the extends that would need to be inserted.
5846 return Cost + 2;
5847}
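// Worked example (assuming SVE): a partial reduction of <vscale x 16 x i8>
// inputs into a <vscale x 4 x i32> accumulator has Ratio == 4, both types
// legalize in one step, and i8 -> i32 is natively supported by udot/sdot, so
// the returned cost is InputLT.first * TCC_Basic == 1; the input type is
// full-width, so the half-width doubling above does not apply.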
5848
5849InstructionCost
5850AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
5851 VectorType *SrcTy, ArrayRef<int> Mask,
5852 TTI::TargetCostKind CostKind, int Index,
5853 VectorType *SubTp, ArrayRef<const Value *> Args,
5854 const Instruction *CxtI) const {
5855 assert((Mask.empty() || DstTy->isScalableTy() ||
5856 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5857 "Expected the Mask to match the return size if given");
5858 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5859 "Expected the same scalar types");
5860 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5861
5862 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5863 // into smaller vectors and sum the cost of each shuffle.
5864 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5865 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5866 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5867 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5868 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5869 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5870 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5871 // cost than just the load.
5872 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5875 return std::max<InstructionCost>(1, LT.first / 4);
5876
5877 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5878 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5879 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5880 // cost than just the store.
5881 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5883 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5885 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5886 return LT.first;
5887
5888 unsigned TpNumElts = Mask.size();
5889 unsigned LTNumElts = LT.second.getVectorNumElements();
5890 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5891 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5892 LT.second.getVectorElementCount());
5894 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5895 PreviousCosts;
5896 for (unsigned N = 0; N < NumVecs; N++) {
5897 SmallVector<int> NMask;
5898 // Split the existing mask into chunks of size LTNumElts. Track the source
5899 // sub-vectors to ensure the result has at most 2 inputs.
5900 unsigned Source1 = -1U, Source2 = -1U;
5901 unsigned NumSources = 0;
5902 for (unsigned E = 0; E < LTNumElts; E++) {
5903 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5905 if (MaskElt < 0) {
5907 continue;
5908 }
5909
5910 // Calculate which source from the input this comes from and whether it
5911 // is new to us.
5912 unsigned Source = MaskElt / LTNumElts;
5913 if (NumSources == 0) {
5914 Source1 = Source;
5915 NumSources = 1;
5916 } else if (NumSources == 1 && Source != Source1) {
5917 Source2 = Source;
5918 NumSources = 2;
5919 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5920 NumSources++;
5921 }
5922
5923 // Add to the new mask. For the NumSources>2 case these are not correct,
5924 // but are only used for the modular lane number.
5925 if (Source == Source1)
5926 NMask.push_back(MaskElt % LTNumElts);
5927 else if (Source == Source2)
5928 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5929 else
5930 NMask.push_back(MaskElt % LTNumElts);
5931 }
5932 // Check if we have already generated this sub-shuffle, which means we
5933 // will have already generated the output. For example a <16 x i32> splat
5934 // will be the same sub-splat 4 times, which only needs to be generated
5935 // once and reused.
5936 auto Result =
5937 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
5938 // Check if it was already in the map (already costed).
5939 if (!Result.second)
5940 continue;
5941 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5942 // getShuffleCost. If not then cost it using the worst case as the number
5943 // of element moves into a new vector.
5944 InstructionCost NCost =
5945 NumSources <= 2
5946 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5948 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
5949 CxtI)
5950 : LTNumElts;
5951 Result.first->second = NCost;
5952 Cost += NCost;
5953 }
5954 return Cost;
5955 }
5956
5957 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
5958 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5959 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
5960 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
5961 // This currently only handles low or high extracts to prevent SLP vectorizer
5962 // regressions.
5963 // Note that SVE's ext instruction is destructive, but it can be fused with
5964 // a movprfx to act like a constructive instruction.
5965 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5966 if (LT.second.getFixedSizeInBits() >= 128 &&
5967 cast<FixedVectorType>(SubTp)->getNumElements() ==
5968 LT.second.getVectorNumElements() / 2) {
5969 if (Index == 0)
5970 return 0;
5971 if (Index == (int)LT.second.getVectorNumElements() / 2)
5972 return 1;
5973 }
5975 }
5976 // FIXME: This was added to keep the costs equal when adding DstTys. Update
5977 // the code to handle length-changing shuffles.
5978 if (Kind == TTI::SK_InsertSubvector) {
5979 LT = getTypeLegalizationCost(DstTy);
5980 SrcTy = DstTy;
5981 }
5982
5983 // Segmented shuffle matching.
5984 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
5985 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
5986 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
5988
5990 unsigned Segments =
5992 unsigned SegmentElts = VTy->getNumElements() / Segments;
5993
5994 // dupq zd.t, zn.t[idx]
5995 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
5996 ST->isSVEorStreamingSVEAvailable() &&
5997 isDUPQMask(Mask, Segments, SegmentElts))
5998 return LT.first;
5999
6000 // mov zd.q, vn
6001 if (ST->isSVEorStreamingSVEAvailable() &&
6002 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6003 return LT.first;
6004 }
6005
6006 // Check for broadcast loads, which are supported by the LD1R instruction.
6007 // In terms of code-size, the shuffle vector is free when a load + dup get
6008 // folded into a LD1R. That's what we check and return here. For performance
6009 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6010 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6011 // that we model the load + dup sequence slightly higher because LD1R is a
6012 // high latency instruction.
6013 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6014 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6015 if (IsLoad && LT.second.isVector() &&
6016 isLegalBroadcastLoad(SrcTy->getElementType(),
6017 LT.second.getVectorElementCount()))
6018 return 0;
6019 }
6020
6021 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6022 // from the perfect shuffle tables.
6023 if (Mask.size() == 4 &&
6024 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6025 (SrcTy->getScalarSizeInBits() == 16 ||
6026 SrcTy->getScalarSizeInBits() == 32) &&
6027 all_of(Mask, [](int E) { return E < 8; }))
6028 return getPerfectShuffleCost(Mask);
6029
6030 // Check for identity masks, which we can treat as free.
6031 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6032 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6033 all_of(enumerate(Mask), [](const auto &M) {
6034 return M.value() < 0 || M.value() == (int)M.index();
6035 }))
6036 return 0;
6037
6038 // Check for other shuffles that are not SK_ kinds but we have native
6039 // instructions for, for example ZIP and UZP.
6040 unsigned Unused;
6041 if (LT.second.isFixedLengthVector() &&
6042 LT.second.getVectorNumElements() == Mask.size() &&
6043 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6044 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6045 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6046 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6047 LT.second.getVectorNumElements(), 16) ||
6048 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6049 LT.second.getVectorNumElements(), 32) ||
6050 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6051 LT.second.getVectorNumElements(), 64) ||
6052 // Check for non-zero lane splats
6053 all_of(drop_begin(Mask),
6054 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6055 return 1;
6056
6057 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6058 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6059 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6060 static const CostTblEntry ShuffleTbl[] = {
6061 // Broadcast shuffle kinds can be performed with 'dup'.
6062 {TTI::SK_Broadcast, MVT::v8i8, 1},
6063 {TTI::SK_Broadcast, MVT::v16i8, 1},
6064 {TTI::SK_Broadcast, MVT::v4i16, 1},
6065 {TTI::SK_Broadcast, MVT::v8i16, 1},
6066 {TTI::SK_Broadcast, MVT::v2i32, 1},
6067 {TTI::SK_Broadcast, MVT::v4i32, 1},
6068 {TTI::SK_Broadcast, MVT::v2i64, 1},
6069 {TTI::SK_Broadcast, MVT::v4f16, 1},
6070 {TTI::SK_Broadcast, MVT::v8f16, 1},
6071 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6072 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6073 {TTI::SK_Broadcast, MVT::v2f32, 1},
6074 {TTI::SK_Broadcast, MVT::v4f32, 1},
6075 {TTI::SK_Broadcast, MVT::v2f64, 1},
6076 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6077 // 'zip1/zip2' instructions.
6078 {TTI::SK_Transpose, MVT::v8i8, 1},
6079 {TTI::SK_Transpose, MVT::v16i8, 1},
6080 {TTI::SK_Transpose, MVT::v4i16, 1},
6081 {TTI::SK_Transpose, MVT::v8i16, 1},
6082 {TTI::SK_Transpose, MVT::v2i32, 1},
6083 {TTI::SK_Transpose, MVT::v4i32, 1},
6084 {TTI::SK_Transpose, MVT::v2i64, 1},
6085 {TTI::SK_Transpose, MVT::v4f16, 1},
6086 {TTI::SK_Transpose, MVT::v8f16, 1},
6087 {TTI::SK_Transpose, MVT::v4bf16, 1},
6088 {TTI::SK_Transpose, MVT::v8bf16, 1},
6089 {TTI::SK_Transpose, MVT::v2f32, 1},
6090 {TTI::SK_Transpose, MVT::v4f32, 1},
6091 {TTI::SK_Transpose, MVT::v2f64, 1},
6092 // Select shuffle kinds.
6093 // TODO: handle vXi8/vXi16.
6094 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6095 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6096 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6097 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6098 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6099 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6100 // PermuteSingleSrc shuffle kinds.
6101 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6102 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6103 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6104 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6105 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6106 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6107 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6108 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6109 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6110 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6111 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6112 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6113 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6114 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6115 // Reverse can be lowered with `rev`.
6116 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6117 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6118 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6119 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6120 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6121 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6122 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6123 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6124 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6125 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6126 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6127 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6128 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6129 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6130 // Splice can all be lowered as `ext`.
6131 {TTI::SK_Splice, MVT::v2i32, 1},
6132 {TTI::SK_Splice, MVT::v4i32, 1},
6133 {TTI::SK_Splice, MVT::v2i64, 1},
6134 {TTI::SK_Splice, MVT::v2f32, 1},
6135 {TTI::SK_Splice, MVT::v4f32, 1},
6136 {TTI::SK_Splice, MVT::v2f64, 1},
6137 {TTI::SK_Splice, MVT::v8f16, 1},
6138 {TTI::SK_Splice, MVT::v8bf16, 1},
6139 {TTI::SK_Splice, MVT::v8i16, 1},
6140 {TTI::SK_Splice, MVT::v16i8, 1},
6141 {TTI::SK_Splice, MVT::v4f16, 1},
6142 {TTI::SK_Splice, MVT::v4bf16, 1},
6143 {TTI::SK_Splice, MVT::v4i16, 1},
6144 {TTI::SK_Splice, MVT::v8i8, 1},
6145 // Broadcast shuffle kinds for scalable vectors
6146 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6147 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6148 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6149 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6150 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6151 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6152 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6153 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6154 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6155 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6156 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6157 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6158 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6159 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6160 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6161 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6162 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6163 // Handle the cases for vector.reverse with scalable vectors
6164 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6165 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6166 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6167 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6168 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6169 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6170 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6171 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6172 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6173 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6174 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6175 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6176 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6177 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6178 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6179 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6180 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6181 };
6182 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6183 return LT.first * Entry->Cost;
6184 }
6185
6186 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6187 return getSpliceCost(SrcTy, Index, CostKind);
6188
6189 // Inserting a subvector can often be done with either a D, S or H register
6190 // move, so long as the inserted vector is "aligned".
6191 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6192 LT.second.getSizeInBits() <= 128 && SubTp) {
6193 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6194 if (SubLT.second.isVector()) {
6195 int NumElts = LT.second.getVectorNumElements();
6196 int NumSubElts = SubLT.second.getVectorNumElements();
6197 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6198 return SubLT.first;
6199 }
6200 }
6201
6202 // Restore optimal kind.
6203 if (IsExtractSubvector)
6204 Kind = TTI::SK_ExtractSubvector;
6205 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6206 Args, CxtI);
6207}
6208
6209static bool containsDecreasingPointers(Loop *TheLoop,
6210 PredicatedScalarEvolution *PSE,
6211 const DominatorTree &DT) {
6212 const auto &Strides = DenseMap<Value *, const SCEV *>();
6213 for (BasicBlock *BB : TheLoop->blocks()) {
6214 // Scan the instructions in the block and look for addresses that are
6215 // consecutive and decreasing.
6216 for (Instruction &I : *BB) {
6217 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6218 Value *Ptr = getLoadStorePointerOperand(&I);
6219 Type *AccessTy = getLoadStoreType(&I);
6220 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6221 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6222 .value_or(0) < 0)
6223 return true;
6224 }
6225 }
6226 }
6227 return false;
6228}
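// For example, a loop that walks an array from the end towards the start
// (e.g. `for (i = n - 1; i >= 0; --i) sum += a[i];`) has a load whose
// pointer stride is negative, so this returns true.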
6229
6230bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6231 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6232 return SVEPreferFixedOverScalableIfEqualCost;
6233 // For cases like post-LTO vectorization, when we eventually know the trip
6234 // count, epilogue with fixed-width vectorization can be deleted if the trip
6235 // count is less than the epilogue iterations. That's why we prefer
6236 // fixed-width vectorization in epilogue in case of equal costs.
6237 if (IsEpilogue)
6238 return true;
6239 return ST->useFixedOverScalableIfEqualCost();
6240}
6241
6242unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6243 return ST->getEpilogueVectorizationMinVF();
6244}
6245
6246bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6247 if (!ST->hasSVE())
6248 return false;
6249
6250 // We don't currently support vectorisation with interleaving for SVE - with
6251 // such loops we're better off not using tail-folding. This gives us a chance
6252 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6253 if (TFI->IAI->hasGroups())
6254 return false;
6255
6256 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6257 if (TFI->LVL->getReductionVars().size())
6258 Required |= TailFoldingOpts::Reductions;
6259 if (TFI->LVL->getFixedOrderRecurrences().size())
6260 Required |= TailFoldingOpts::Recurrences;
6261
6262 // We call this to discover whether any load/store pointers in the loop have
6263 // negative strides. This will require extra work to reverse the loop
6264 // predicate, which may be expensive.
6265 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6266 TFI->LVL->getPredicatedScalarEvolution(),
6267 *TFI->LVL->getDominatorTree()))
6268 Required |= TailFoldingOpts::Reverse;
6269 if (Required == TailFoldingOpts::Disabled)
6270 Required |= TailFoldingOpts::Simple;
6271
6272 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6273 Required))
6274 return false;
6275
6276 // Don't tail-fold for tight loops where we would be better off interleaving
6277 // with an unpredicated loop.
6278 unsigned NumInsns = 0;
6279 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6280 NumInsns += BB->sizeWithoutDebug();
6281 }
6282
6283 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6284 return NumInsns >= SVETailFoldInsnThreshold;
6285}
6286
6287InstructionCost
6288AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6289 StackOffset BaseOffset, bool HasBaseReg,
6290 int64_t Scale, unsigned AddrSpace) const {
6291 // Scaling factors are not free at all.
6292 // Operands | Rt Latency
6293 // -------------------------------------------
6294 // Rt, [Xn, Xm] | 4
6295 // -------------------------------------------
6296 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6297 // Rt, [Xn, Wm, <extend> #imm] |
6298 TargetLoweringBase::AddrMode AM;
6299 AM.BaseGV = BaseGV;
6300 AM.BaseOffs = BaseOffset.getFixed();
6301 AM.HasBaseReg = HasBaseReg;
6302 AM.Scale = Scale;
6303 AM.ScalableOffset = BaseOffset.getScalable();
6304 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6305 // Scale represents reg2 * scale, thus account for 1 if
6306 // it is not equal to 0 or 1.
6307 return AM.Scale != 0 && AM.Scale != 1;
6308 return InstructionCost::getInvalid();
6309}
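// For example, an access such as `ldr x0, [x1, x2, lsl #3]` has Scale == 8;
// when that mode is legal for the access type the scaling factor costs 1,
// while Scale == 0 (no index) or Scale == 1 (unscaled index) is free.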
6310
6311bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6312 const Instruction *I) const {
6313 if (EnableOrLikeSelectOpt) {
6314 // For the binary operators (e.g. or) we need to be more careful than
6315 // selects; here we only transform them if they are already at a natural
6316 // break point in the code - the end of a block with an unconditional
6317 // terminator.
6318 if (I->getOpcode() == Instruction::Or &&
6319 isa<BranchInst>(I->getNextNode()) &&
6320 cast<BranchInst>(I->getNextNode())->isUnconditional())
6321 return true;
6322
6323 if (I->getOpcode() == Instruction::Add ||
6324 I->getOpcode() == Instruction::Sub)
6325 return true;
6326 }
6327 return BaseT::shouldTreatInstructionLikeSelect(I);
6328}
6329
6330bool AArch64TTIImpl::isLSRCostLess(
6331 const TargetTransformInfo::LSRCost &C1,
6332 const TargetTransformInfo::LSRCost &C2) const {
6333 // AArch64 specific here is adding the number of instructions to the
6334 // comparison (though not as the first consideration, as some targets do)
6335 // along with changing the priority of the base additions.
6336 // TODO: Maybe a more nuanced tradeoff between instruction count
6337 // and number of registers? To be investigated at a later date.
6338 if (EnableLSRCostOpt)
6339 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6340 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6341 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6342 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6343
6344 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6345}
6346
6347static bool isSplatShuffle(Value *V) {
6348 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6349 return all_equal(Shuf->getShuffleMask());
6350 return false;
6351}
6352
6353/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6354/// or upper half of the vector elements.
6355static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6356 bool AllowSplat = false) {
6357 // Scalable types can't be extract shuffle vectors.
6358 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6359 return false;
6360
6361 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6362 auto *FullTy = FullV->getType();
6363 auto *HalfTy = HalfV->getType();
6364 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6365 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6366 };
6367
6368 auto extractHalf = [](Value *FullV, Value *HalfV) {
6369 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6370 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6371 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6372 };
6373
6374 ArrayRef<int> M1, M2;
6375 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6376 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6377 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6378 return false;
6379
6380 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6381 // it is not checked as an extract below.
6382 if (AllowSplat && isSplatShuffle(Op1))
6383 S1Op1 = nullptr;
6384 if (AllowSplat && isSplatShuffle(Op2))
6385 S2Op1 = nullptr;
6386
6387 // Check that the operands are half as wide as the result and we extract
6388 // half of the elements of the input vectors.
6389 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6390 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6391 return false;
6392
6393 // Check the mask extracts either the lower or upper half of vector
6394 // elements.
6395 int M1Start = 0;
6396 int M2Start = 0;
6397 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6398 if ((S1Op1 &&
6399 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6400 (S2Op1 &&
6401 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6402 return false;
6403
6404 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6405 (M2Start != 0 && M2Start != (NumElements / 2)))
6406 return false;
6407 if (S1Op1 && S2Op1 && M1Start != M2Start)
6408 return false;
6409
6410 return true;
6411}
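// For example, for a <8 x i16> source the accepted operands look like
//   %lo = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// i.e. each mask extracts a contiguous lower or upper half and, splats
// aside, both operands must extract the same half.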
6412
6413/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6414/// of the vector elements.
6415static bool areExtractExts(Value *Ext1, Value *Ext2) {
6416 auto areExtDoubled = [](Instruction *Ext) {
6417 return Ext->getType()->getScalarSizeInBits() ==
6418 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6419 };
6420
6421 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6422 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6423 !areExtDoubled(cast<Instruction>(Ext1)) ||
6424 !areExtDoubled(cast<Instruction>(Ext2)))
6425 return false;
6426
6427 return true;
6428}
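// For example, `sext <4 x i16> %a to <4 x i32>` and `sext <4 x i16> %b to
// <4 x i32>` qualify: each extend exactly doubles the element width.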
6429
6430/// Check if Op could be used with vmull_high_p64 intrinsic.
6431static bool isOperandOfVmullHighP64(Value *Op) {
6432 Value *VectorOperand = nullptr;
6433 ConstantInt *ElementIndex = nullptr;
6434 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6435 m_ConstantInt(ElementIndex))) &&
6436 ElementIndex->getValue() == 1 &&
6437 isa<FixedVectorType>(VectorOperand->getType()) &&
6438 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6439}
6440
6441/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6442static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6443 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6444}
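// For example, if both operands are `extractelement <2 x i64> %v, i64 1`
// (the high lane of a 128-bit vector), the pmull64 call can be selected
// using the high-half form of the instruction.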
6445
6446static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6447 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6448 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6449 if (!GEP || GEP->getNumOperands() != 2)
6450 return false;
6451
6452 Value *Base = GEP->getOperand(0);
6453 Value *Offsets = GEP->getOperand(1);
6454
6455 // We only care about scalar_base+vector_offsets.
6456 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6457 return false;
6458
6459 // Sink extends that would allow us to use 32-bit offset vectors.
6460 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6461 auto *OffsetsInst = cast<Instruction>(Offsets);
6462 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6463 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6464 Ops.push_back(&GEP->getOperandUse(1));
6465 }
6466
6467 // Sink the GEP.
6468 return true;
6469}
6470
6471/// We want to sink following cases:
6472/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6473/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6474static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6475 if (match(Op, m_VScale()))
6476 return true;
6477 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6478 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6479 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6480 return true;
6481 }
6482 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6483 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6484 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6485 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6486 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6487 return true;
6488 }
6489 return false;
6490}
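// For example, given
//   %vs  = call i64 @llvm.vscale.i64()
//   %off = shl i64 %vs, 4
//   %gep = getelementptr i8, ptr %base, i64 %off
// both the vscale call and the shl are sunk next to the gep so that
// instruction selection sees the complete vscale-based offset in one block.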
6491
6492/// Check if sinking \p I's operands to I's basic block is profitable, because
6493/// the operands can be folded into a target instruction, e.g.
6494/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6495bool AArch64TTIImpl::isProfitableToSinkOperands(
6496 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6497 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
6498 switch (II->getIntrinsicID()) {
6499 case Intrinsic::aarch64_neon_smull:
6500 case Intrinsic::aarch64_neon_umull:
6501 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6502 /*AllowSplat=*/true)) {
6503 Ops.push_back(&II->getOperandUse(0));
6504 Ops.push_back(&II->getOperandUse(1));
6505 return true;
6506 }
6507 [[fallthrough]];
6508
6509 case Intrinsic::fma:
6510 case Intrinsic::fmuladd:
6511 if (isa<VectorType>(I->getType()) &&
6512 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6513 !ST->hasFullFP16())
6514 return false;
6515 [[fallthrough]];
6516 case Intrinsic::aarch64_neon_sqdmull:
6517 case Intrinsic::aarch64_neon_sqdmulh:
6518 case Intrinsic::aarch64_neon_sqrdmulh:
6519 // Sink splats for index lane variants
6520 if (isSplatShuffle(II->getOperand(0)))
6521 Ops.push_back(&II->getOperandUse(0));
6522 if (isSplatShuffle(II->getOperand(1)))
6523 Ops.push_back(&II->getOperandUse(1));
6524 return !Ops.empty();
6525 case Intrinsic::aarch64_neon_fmlal:
6526 case Intrinsic::aarch64_neon_fmlal2:
6527 case Intrinsic::aarch64_neon_fmlsl:
6528 case Intrinsic::aarch64_neon_fmlsl2:
6529 // Sink splats for index lane variants
6530 if (isSplatShuffle(II->getOperand(1)))
6531 Ops.push_back(&II->getOperandUse(1));
6532 if (isSplatShuffle(II->getOperand(2)))
6533 Ops.push_back(&II->getOperandUse(2));
6534 return !Ops.empty();
6535 case Intrinsic::aarch64_sve_ptest_first:
6536 case Intrinsic::aarch64_sve_ptest_last:
6537 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6538 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6539 Ops.push_back(&II->getOperandUse(0));
6540 return !Ops.empty();
6541 case Intrinsic::aarch64_sme_write_horiz:
6542 case Intrinsic::aarch64_sme_write_vert:
6543 case Intrinsic::aarch64_sme_writeq_horiz:
6544 case Intrinsic::aarch64_sme_writeq_vert: {
6545 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6546 if (!Idx || Idx->getOpcode() != Instruction::Add)
6547 return false;
6548 Ops.push_back(&II->getOperandUse(1));
6549 return true;
6550 }
6551 case Intrinsic::aarch64_sme_read_horiz:
6552 case Intrinsic::aarch64_sme_read_vert:
6553 case Intrinsic::aarch64_sme_readq_horiz:
6554 case Intrinsic::aarch64_sme_readq_vert:
6555 case Intrinsic::aarch64_sme_ld1b_vert:
6556 case Intrinsic::aarch64_sme_ld1h_vert:
6557 case Intrinsic::aarch64_sme_ld1w_vert:
6558 case Intrinsic::aarch64_sme_ld1d_vert:
6559 case Intrinsic::aarch64_sme_ld1q_vert:
6560 case Intrinsic::aarch64_sme_st1b_vert:
6561 case Intrinsic::aarch64_sme_st1h_vert:
6562 case Intrinsic::aarch64_sme_st1w_vert:
6563 case Intrinsic::aarch64_sme_st1d_vert:
6564 case Intrinsic::aarch64_sme_st1q_vert:
6565 case Intrinsic::aarch64_sme_ld1b_horiz:
6566 case Intrinsic::aarch64_sme_ld1h_horiz:
6567 case Intrinsic::aarch64_sme_ld1w_horiz:
6568 case Intrinsic::aarch64_sme_ld1d_horiz:
6569 case Intrinsic::aarch64_sme_ld1q_horiz:
6570 case Intrinsic::aarch64_sme_st1b_horiz:
6571 case Intrinsic::aarch64_sme_st1h_horiz:
6572 case Intrinsic::aarch64_sme_st1w_horiz:
6573 case Intrinsic::aarch64_sme_st1d_horiz:
6574 case Intrinsic::aarch64_sme_st1q_horiz: {
6575 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6576 if (!Idx || Idx->getOpcode() != Instruction::Add)
6577 return false;
6578 Ops.push_back(&II->getOperandUse(3));
6579 return true;
6580 }
6581 case Intrinsic::aarch64_neon_pmull:
6582 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6583 return false;
6584 Ops.push_back(&II->getOperandUse(0));
6585 Ops.push_back(&II->getOperandUse(1));
6586 return true;
6587 case Intrinsic::aarch64_neon_pmull64:
6588 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6589 II->getArgOperand(1)))
6590 return false;
6591 Ops.push_back(&II->getArgOperandUse(0));
6592 Ops.push_back(&II->getArgOperandUse(1));
6593 return true;
6594 case Intrinsic::masked_gather:
6595 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6596 return false;
6597 Ops.push_back(&II->getArgOperandUse(0));
6598 return true;
6599 case Intrinsic::masked_scatter:
6600 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6601 return false;
6602 Ops.push_back(&II->getArgOperandUse(1));
6603 return true;
6604 default:
6605 return false;
6606 }
6607 }
6608
6609 auto ShouldSinkCondition = [](Value *Cond,
6610 SmallVectorImpl<Use *> &Ops) -> bool {
6611 if (!isa<IntrinsicInst>(Cond))
6612 return false;
6613 auto *II = cast<IntrinsicInst>(Cond);
6614 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6615 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6616 return false;
6617 if (isa<CmpInst>(II->getOperand(0)))
6618 Ops.push_back(&II->getOperandUse(0));
6619 return true;
6620 };
6621
6622 switch (I->getOpcode()) {
6623 case Instruction::GetElementPtr:
6624 case Instruction::Add:
6625 case Instruction::Sub:
6626 // Sink vscales closer to uses for better isel
6627 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6628 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6629 Ops.push_back(&I->getOperandUse(Op));
6630 return true;
6631 }
6632 }
6633 break;
6634 case Instruction::Select: {
6635 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6636 return false;
6637
6638 Ops.push_back(&I->getOperandUse(0));
6639 return true;
6640 }
6641 case Instruction::Br: {
6642 if (cast<BranchInst>(I)->isUnconditional())
6643 return false;
6644
6645 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6646 return false;
6647
6648 Ops.push_back(&I->getOperandUse(0));
6649 return true;
6650 }
6651 default:
6652 break;
6653 }
6654
6655 if (!I->getType()->isVectorTy())
6656 return false;
6657
6658 switch (I->getOpcode()) {
6659 case Instruction::Sub:
6660 case Instruction::Add: {
6661 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6662 return false;
6663
6664 // If the exts' operands extract either the lower or upper elements, we
6665 // can sink them too.
6666 auto Ext1 = cast<Instruction>(I->getOperand(0));
6667 auto Ext2 = cast<Instruction>(I->getOperand(1));
6668 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6669 Ops.push_back(&Ext1->getOperandUse(0));
6670 Ops.push_back(&Ext2->getOperandUse(0));
6671 }
6672
6673 Ops.push_back(&I->getOperandUse(0));
6674 Ops.push_back(&I->getOperandUse(1));
6675
6676 return true;
6677 }
6678 case Instruction::Or: {
6679 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6680 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6681 if (ST->hasNEON()) {
6682 Instruction *OtherAnd, *IA, *IB;
6683 Value *MaskValue;
6684 // MainAnd refers to And instruction that has 'Not' as one of its operands
6685 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6686 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6687 m_Instruction(IA)))))) {
6688 if (match(OtherAnd,
6689 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6690 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6691 ? cast<Instruction>(I->getOperand(1))
6692 : cast<Instruction>(I->getOperand(0));
6693
6694 // Both Ands should be in same basic block as Or
6695 if (I->getParent() != MainAnd->getParent() ||
6696 I->getParent() != OtherAnd->getParent())
6697 return false;
6698
6699 // Non-mask operands of both Ands should also be in same basic block
6700 if (I->getParent() != IA->getParent() ||
6701 I->getParent() != IB->getParent())
6702 return false;
6703
6704 Ops.push_back(
6705 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6706 Ops.push_back(&I->getOperandUse(0));
6707 Ops.push_back(&I->getOperandUse(1));
6708
6709 return true;
6710 }
6711 }
6712 }
6713
6714 return false;
6715 }
6716 case Instruction::Mul: {
6717 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6718 auto *Ty = cast<VectorType>(V->getType());
6719 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6720 if (Ty->isScalableTy())
6721 return false;
6722
6723 // Indexed variants of Mul exist for i16 and i32 element types only.
6724 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6725 };
6726
6727 int NumZExts = 0, NumSExts = 0;
6728 for (auto &Op : I->operands()) {
6729 // Make sure we are not already sinking this operand
6730 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6731 continue;
6732
6733 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6734 auto *Ext = cast<Instruction>(Op);
6735 auto *ExtOp = Ext->getOperand(0);
6736 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6737 Ops.push_back(&Ext->getOperandUse(0));
6738 Ops.push_back(&Op);
6739
6740 if (isa<SExtInst>(Ext)) {
6741 NumSExts++;
6742 } else {
6743 NumZExts++;
6744 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6745 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6746 I->getType()->getScalarSizeInBits())
6747 NumSExts++;
6748 }
6749
6750 continue;
6751 }
6752
6753 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6754 if (!Shuffle)
6755 continue;
6756
6757 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6758 // operand and the s/zext can help create indexed s/umull. This is
6759 // especially useful to prevent i64 mul being scalarized.
6760 if (isSplatShuffle(Shuffle) &&
6761 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6762 Ops.push_back(&Shuffle->getOperandUse(0));
6763 Ops.push_back(&Op);
6764 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6765 NumSExts++;
6766 else
6767 NumZExts++;
6768 continue;
6769 }
6770
6771 Value *ShuffleOperand = Shuffle->getOperand(0);
6772 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6773 if (!Insert)
6774 continue;
6775
6776 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6777 if (!OperandInstr)
6778 continue;
6779
6780 ConstantInt *ElementConstant =
6781 dyn_cast<ConstantInt>(Insert->getOperand(2));
6782 // Check that the insertelement is inserting into element 0
6783 if (!ElementConstant || !ElementConstant->isZero())
6784 continue;
6785
6786 unsigned Opcode = OperandInstr->getOpcode();
6787 if (Opcode == Instruction::SExt)
6788 NumSExts++;
6789 else if (Opcode == Instruction::ZExt)
6790 NumZExts++;
6791 else {
6792 // If we find that the top bits are known 0, then we can sink and allow
6793 // the backend to generate a umull.
6794 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6795 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6796 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6797 continue;
6798 NumZExts++;
6799 }
6800
6801 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6802 // the And, just to hoist it again back to the load.
6803 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6804 Ops.push_back(&Insert->getOperandUse(1));
6805 Ops.push_back(&Shuffle->getOperandUse(0));
6806 Ops.push_back(&Op);
6807 }
6808
6809 // It is profitable to sink if we found two of the same type of extends.
6810 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6811 return true;
6812
6813 // Otherwise, see if we should sink splats for indexed variants.
6814 if (!ShouldSinkSplatForIndexedVariant(I))
6815 return false;
6816
6817 Ops.clear();
6818 if (isSplatShuffle(I->getOperand(0)))
6819 Ops.push_back(&I->getOperandUse(0));
6820 if (isSplatShuffle(I->getOperand(1)))
6821 Ops.push_back(&I->getOperandUse(1));
6822
6823 return !Ops.empty();
6824 }
6825 case Instruction::FMul: {
6826 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6827 if (I->getType()->isScalableTy())
6828 return false;
6829
6830 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6831 !ST->hasFullFP16())
6832 return false;
6833
6834 // Sink splats for index lane variants
6835 if (isSplatShuffle(I->getOperand(0)))
6836 Ops.push_back(&I->getOperandUse(0));
6837 if (isSplatShuffle(I->getOperand(1)))
6838 Ops.push_back(&I->getOperandUse(1));
6839 return !Ops.empty();
6840 }
6841 default:
6842 return false;
6843 }
6844 return false;
6845}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
unsigned countLeadingOnes() const
Definition APInt.h:1625
void negate()
Negate this APInt in place.
Definition APInt.h:1469
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1762
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:760
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2497
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Definition IRBuilder.h:1850
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
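The IRBuilder entries above are the main construction API used when this file rewrites instructions; a minimal sketch (illustrative names, not code from this file) that loads a scalar and broadcasts it across a fixed-width vector:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *loadAndSplat(BasicBlock *BB, Value *Ptr) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append new instructions at the end of BB
  LoadInst *Scalar = Builder.CreateLoad(Builder.getFloatTy(), Ptr, "scalar");
  // Broadcast the loaded float into a <4 x float> value.
  return Builder.CreateVectorSplat(4, Scalar, "splat");
}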
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:825
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:31
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:40
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:702
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
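A brief sketch of how the cost kinds above are passed to the generic costing entry point, assuming a TargetTransformInfo reference obtained from the usual analysis pass (the function name is illustrative):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void queryBothCostKinds(const TargetTransformInfo &TTI,
                               const Instruction &I) {
  // The same instruction can have different costs under different models.
  InstructionCost Throughput =
      TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost Size =
      TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
  (void)Throughput;
  (void)Size;
}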
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:347
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:948
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:388
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:217
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:253
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
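The PatternMatch combinators above are how the cost and combine hooks in this file recognize IR idioms; a small sketch, using only matchers listed here, that checks whether an instruction computes an add of two zero-extended values (names are illustrative):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Illustrative only: does I compute zext(X) + zext(Y), in either operand order?
static bool isZExtAddPattern(Instruction &I, Value *&X, Value *&Y) {
  return match(&I, m_c_Add(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y))));
}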
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
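CostTableLookup is the helper this file uses to map an ISD opcode plus an MVT onto a precomputed cost; a minimal sketch with a made-up one-entry table (the entry, cost, and function name are illustrative, not taken from this file):

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static unsigned lookupAddCost(MVT VT) {
  static const CostTblEntry Tbl[] = {
      {ISD::ADD, MVT::v4i32, 1}, // hypothetical: unit cost for a v4i32 add
  };
  if (const auto *Entry = CostTableLookup(Tbl, ISD::ADD, VT))
    return Entry->Cost;
  return 0; // no table entry: caller falls back to the generic cost
}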
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
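A hedged sketch of how the AArch64 shuffle-mask classifiers listed here are queried; the mask is the canonical zip1 pattern for eight lanes, and the surrounding function is invented for illustration (isZIPMask itself is declared in the AArch64 backend headers this file includes):

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

static bool isZipMaskOfEight() {
  const int Mask[8] = {0, 8, 1, 9, 2, 10, 3, 11}; // zip1-style interleave of lanes 0..7
  unsigned WhichResult = 0;
  // On success, WhichResult distinguishes the zip1 form from the zip2 form.
  return isZIPMask(Mask, /*NumElts=*/8, WhichResult);
}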
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2108
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
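ConvertCostTableLookup is the two-type analogue used for cast and conversion costs; a sketch with an illustrative entry (not one of the tables in this file):

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static unsigned lookupZExtCost(MVT Dst, MVT Src) {
  static const TypeConversionCostTblEntry Tbl[] = {
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1}, // hypothetical widening cost
  };
  if (const auto *Entry = ConvertCostTableLookup(Tbl, ISD::ZERO_EXTEND, Dst, Src))
    return Entry->Cost;
  return 0;
}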
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
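These UnrollingPreferences fields are what a target's getUnrollingPreferences hook fills in; a hedged sketch of how a backend might configure a few of them (the values and function name are illustrative, not the ones chosen in this file):

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // allow partial unrolling of large loop bodies
  UP.Runtime = true;                // allow unrolling loops with run-time trip counts
  UP.UnrollRemainder = true;        // fully unroll the runtime remainder loop
  UP.DefaultUnrollRuntimeCount = 4; // hypothetical runtime unroll factor
  UP.PartialThreshold = 60;         // hypothetical size budget for partial unrolling
}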