LLVM 19.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
23#include "llvm/Support/Debug.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
43 cl::init(15), cl::Hidden);
44
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
48
50 "call-penalty-sm-change", cl::init(5), cl::Hidden,
52 "Penalty of calling a function that requires a change to PSTATE.SM"));
53
55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
56 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
57
58static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
59 cl::init(true), cl::Hidden);
60
61namespace {
62class TailFoldingOption {
63 // These bitfields will only ever be set to something non-zero in operator=,
64 // when setting the -sve-tail-folding option. This option should always be of
65 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
66 // InitialBits is one of (disabled|all|simple). EnableBits represents
67 // additional flags we're enabling, and DisableBits for those flags we're
68 // disabling. The default flag is tracked in the variable NeedsDefault, since
69 // at the time of setting the option we may not know what the default value
70 // for the CPU is.
71 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
72 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
73 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
74
75 // This value needs to be initialised to true in case the user does not
76 // explicitly set the -sve-tail-folding option.
77 bool NeedsDefault = true;
78
79 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
80
81 void setNeedsDefault(bool V) { NeedsDefault = V; }
82
83 void setEnableBit(TailFoldingOpts Bit) {
84 EnableBits |= Bit;
85 DisableBits &= ~Bit;
86 }
87
88 void setDisableBit(TailFoldingOpts Bit) {
89 EnableBits &= ~Bit;
90 DisableBits |= Bit;
91 }
92
93 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
94 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
95
96 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
97 "Initial bits should only include one of "
98 "(disabled|all|simple|default)");
99 Bits = NeedsDefault ? DefaultBits : InitialBits;
100 Bits |= EnableBits;
101 Bits &= ~DisableBits;
102
103 return Bits;
104 }
105
106 void reportError(std::string Opt) {
107 errs() << "invalid argument '" << Opt
108 << "' to -sve-tail-folding=; the option should be of the form\n"
109 " (disabled|all|default|simple)[+(reductions|recurrences"
110 "|reverse|noreductions|norecurrences|noreverse)]\n";
111 report_fatal_error("Unrecognised tail-folding option");
112 }
113
114public:
115
116 void operator=(const std::string &Val) {
117 // If the user explicitly sets -sve-tail-folding= then treat as an error.
118 if (Val.empty()) {
119 reportError("");
120 return;
121 }
122
123 // Since the user is explicitly setting the option we don't automatically
124 // need the default unless they require it.
125 setNeedsDefault(false);
126
127 SmallVector<StringRef, 4> TailFoldTypes;
128 StringRef(Val).split(TailFoldTypes, '+', -1, false);
129
130 unsigned StartIdx = 1;
131 if (TailFoldTypes[0] == "disabled")
132 setInitialBits(TailFoldingOpts::Disabled);
133 else if (TailFoldTypes[0] == "all")
134 setInitialBits(TailFoldingOpts::All);
135 else if (TailFoldTypes[0] == "default")
136 setNeedsDefault(true);
137 else if (TailFoldTypes[0] == "simple")
138 setInitialBits(TailFoldingOpts::Simple);
139 else {
140 StartIdx = 0;
141 setInitialBits(TailFoldingOpts::Disabled);
142 }
143
144 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
145 if (TailFoldTypes[I] == "reductions")
146 setEnableBit(TailFoldingOpts::Reductions);
147 else if (TailFoldTypes[I] == "recurrences")
148 setEnableBit(TailFoldingOpts::Recurrences);
149 else if (TailFoldTypes[I] == "reverse")
150 setEnableBit(TailFoldingOpts::Reverse);
151 else if (TailFoldTypes[I] == "noreductions")
152 setDisableBit(TailFoldingOpts::Reductions);
153 else if (TailFoldTypes[I] == "norecurrences")
154 setDisableBit(TailFoldingOpts::Recurrences);
155 else if (TailFoldTypes[I] == "noreverse")
156 setDisableBit(TailFoldingOpts::Reverse);
157 else
158 reportError(Val);
159 }
160 }
161
162 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
163 return (getBits(DefaultBits) & Required) == Required;
164 }
165};
166} // namespace
167
168TailFoldingOption TailFoldingOptionLoc;
169
171 "sve-tail-folding",
172 cl::desc(
173 "Control the use of vectorisation using tail-folding for SVE where the"
174 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
175 "\ndisabled (Initial) No loop types will vectorize using "
176 "tail-folding"
177 "\ndefault (Initial) Uses the default tail-folding settings for "
178 "the target CPU"
179 "\nall (Initial) All legal loop types will vectorize using "
180 "tail-folding"
181 "\nsimple (Initial) Use tail-folding for simple loops (not "
182 "reductions or recurrences)"
183 "\nreductions Use tail-folding for loops containing reductions"
184 "\nnoreductions Inverse of above"
185 "\nrecurrences Use tail-folding for loops containing fixed order "
186 "recurrences"
187 "\nnorecurrences Inverse of above"
188 "\nreverse Use tail-folding for loops requiring reversed "
189 "predicates"
190 "\nnoreverse Inverse of above"),
192
193// Experimental option that will only be fully functional when the
194// code-generator is changed to use SVE instead of NEON for all fixed-width
195// operations.
197 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
198
199// Experimental option that will only be fully functional when the cost-model
200// and code-generator have been changed to avoid using scalable vector
201// instructions that are not legal in streaming SVE mode.
203 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
204
205static bool isSMEABIRoutineCall(const CallInst &CI) {
206 const auto *F = CI.getCalledFunction();
207 return F && StringSwitch<bool>(F->getName())
208 .Case("__arm_sme_state", true)
209 .Case("__arm_tpidr2_save", true)
210 .Case("__arm_tpidr2_restore", true)
211 .Case("__arm_za_disable", true)
212 .Default(false);
213}
214
215/// Returns true if the function has explicit operations that can only be
216/// lowered using incompatible instructions for the selected mode. This also
217/// returns true if the function F may use or modify ZA state.
219 for (const BasicBlock &BB : *F) {
220 for (const Instruction &I : BB) {
221 // Be conservative for now and assume that any call to inline asm or to
222 // intrinsics could could result in non-streaming ops (e.g. calls to
223 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
224 // all native LLVM instructions can be lowered to compatible instructions.
225 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
226 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
227 isSMEABIRoutineCall(cast<CallInst>(I))))
228 return true;
229 }
230 }
231 return false;
232}
233
235 const Function *Callee) const {
236 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
237
238 // When inlining, we should consider the body of the function, not the
239 // interface.
240 if (CalleeAttrs.hasStreamingBody()) {
241 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
242 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
243 }
244
245 if (CalleeAttrs.isNewZA())
246 return false;
247
248 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
249 CallerAttrs.requiresSMChange(CalleeAttrs)) {
250 if (hasPossibleIncompatibleOps(Callee))
251 return false;
252 }
253
254 const TargetMachine &TM = getTLI()->getTargetMachine();
255
256 const FeatureBitset &CallerBits =
257 TM.getSubtargetImpl(*Caller)->getFeatureBits();
258 const FeatureBitset &CalleeBits =
259 TM.getSubtargetImpl(*Callee)->getFeatureBits();
260
261 // Inline a callee if its target-features are a subset of the callers
262 // target-features.
263 return (CallerBits & CalleeBits) == CalleeBits;
264}
265
267 const Function *Caller, const Function *Callee,
268 const ArrayRef<Type *> &Types) const {
269 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
270 return false;
271
272 // We need to ensure that argument promotion does not attempt to promote
273 // pointers to fixed-length vector types larger than 128 bits like
274 // <8 x float> (and pointers to aggregate types which have such fixed-length
275 // vector type members) into the values of the pointees. Such vector types
276 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
277 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
278 // types can be safely treated as 128-bit NEON types and they cannot be
279 // distinguished in IR.
280 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
281 auto FVTy = dyn_cast<FixedVectorType>(Ty);
282 return FVTy &&
283 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
284 }))
285 return false;
286
287 return true;
288}
289
290unsigned
292 unsigned DefaultCallPenalty) const {
293 // This function calculates a penalty for executing Call in F.
294 //
295 // There are two ways this function can be called:
296 // (1) F:
297 // call from F -> G (the call here is Call)
298 //
299 // For (1), Call.getCaller() == F, so it will always return a high cost if
300 // a streaming-mode change is required (thus promoting the need to inline the
301 // function)
302 //
303 // (2) F:
304 // call from F -> G (the call here is not Call)
305 // G:
306 // call from G -> H (the call here is Call)
307 //
308 // For (2), if after inlining the body of G into F the call to H requires a
309 // streaming-mode change, and the call to G from F would also require a
310 // streaming-mode change, then there is benefit to do the streaming-mode
311 // change only once and avoid inlining of G into F.
312 SMEAttrs FAttrs(*F);
313 SMEAttrs CalleeAttrs(Call);
314 if (FAttrs.requiresSMChange(CalleeAttrs)) {
315 if (F == Call.getCaller()) // (1)
316 return CallPenaltyChangeSM * DefaultCallPenalty;
317 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
318 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
319 }
320
321 return DefaultCallPenalty;
322}
323
328 ST->isNeonAvailable());
329}
330
331/// Calculate the cost of materializing a 64-bit value. This helper
332/// method might only calculate a fraction of a larger immediate. Therefore it
333/// is valid to return a cost of ZERO.
335 // Check if the immediate can be encoded within an instruction.
336 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
337 return 0;
338
339 if (Val < 0)
340 Val = ~Val;
341
342 // Calculate how many moves we will need to materialize this constant.
345 return Insn.size();
346}
347
348/// Calculate the cost of materializing the given constant.
351 assert(Ty->isIntegerTy());
352
353 unsigned BitSize = Ty->getPrimitiveSizeInBits();
354 if (BitSize == 0)
355 return ~0U;
356
357 // Sign-extend all constants to a multiple of 64-bit.
358 APInt ImmVal = Imm;
359 if (BitSize & 0x3f)
360 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
361
362 // Split the constant into 64-bit chunks and calculate the cost for each
363 // chunk.
365 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
366 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
367 int64_t Val = Tmp.getSExtValue();
368 Cost += getIntImmCost(Val);
369 }
370 // We need at least one instruction to materialze the constant.
371 return std::max<InstructionCost>(1, Cost);
372}
373
375 const APInt &Imm, Type *Ty,
377 Instruction *Inst) {
378 assert(Ty->isIntegerTy());
379
380 unsigned BitSize = Ty->getPrimitiveSizeInBits();
381 // There is no cost model for constants with a bit size of 0. Return TCC_Free
382 // here, so that constant hoisting will ignore this constant.
383 if (BitSize == 0)
384 return TTI::TCC_Free;
385
386 unsigned ImmIdx = ~0U;
387 switch (Opcode) {
388 default:
389 return TTI::TCC_Free;
390 case Instruction::GetElementPtr:
391 // Always hoist the base address of a GetElementPtr.
392 if (Idx == 0)
393 return 2 * TTI::TCC_Basic;
394 return TTI::TCC_Free;
395 case Instruction::Store:
396 ImmIdx = 0;
397 break;
398 case Instruction::Add:
399 case Instruction::Sub:
400 case Instruction::Mul:
401 case Instruction::UDiv:
402 case Instruction::SDiv:
403 case Instruction::URem:
404 case Instruction::SRem:
405 case Instruction::And:
406 case Instruction::Or:
407 case Instruction::Xor:
408 case Instruction::ICmp:
409 ImmIdx = 1;
410 break;
411 // Always return TCC_Free for the shift value of a shift instruction.
412 case Instruction::Shl:
413 case Instruction::LShr:
414 case Instruction::AShr:
415 if (Idx == 1)
416 return TTI::TCC_Free;
417 break;
418 case Instruction::Trunc:
419 case Instruction::ZExt:
420 case Instruction::SExt:
421 case Instruction::IntToPtr:
422 case Instruction::PtrToInt:
423 case Instruction::BitCast:
424 case Instruction::PHI:
425 case Instruction::Call:
426 case Instruction::Select:
427 case Instruction::Ret:
428 case Instruction::Load:
429 break;
430 }
431
432 if (Idx == ImmIdx) {
433 int NumConstants = (BitSize + 63) / 64;
435 return (Cost <= NumConstants * TTI::TCC_Basic)
436 ? static_cast<int>(TTI::TCC_Free)
437 : Cost;
438 }
440}
441
444 const APInt &Imm, Type *Ty,
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 // Most (all?) AArch64 intrinsics do not support folding immediates into the
455 // selected instruction, so we compute the materialization cost for the
456 // immediate directly.
457 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
459
460 switch (IID) {
461 default:
462 return TTI::TCC_Free;
463 case Intrinsic::sadd_with_overflow:
464 case Intrinsic::uadd_with_overflow:
465 case Intrinsic::ssub_with_overflow:
466 case Intrinsic::usub_with_overflow:
467 case Intrinsic::smul_with_overflow:
468 case Intrinsic::umul_with_overflow:
469 if (Idx == 1) {
470 int NumConstants = (BitSize + 63) / 64;
472 return (Cost <= NumConstants * TTI::TCC_Basic)
473 ? static_cast<int>(TTI::TCC_Free)
474 : Cost;
475 }
476 break;
477 case Intrinsic::experimental_stackmap:
478 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
479 return TTI::TCC_Free;
480 break;
481 case Intrinsic::experimental_patchpoint_void:
482 case Intrinsic::experimental_patchpoint:
483 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
484 return TTI::TCC_Free;
485 break;
486 case Intrinsic::experimental_gc_statepoint:
487 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
488 return TTI::TCC_Free;
489 break;
490 }
492}
493
496 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
497 if (TyWidth == 32 || TyWidth == 64)
499 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
500 return TTI::PSK_Software;
501}
502
503static bool isUnpackedVectorVT(EVT VecVT) {
504 return VecVT.isScalableVector() &&
506}
507
511 auto *RetTy = ICA.getReturnType();
512 switch (ICA.getID()) {
513 case Intrinsic::umin:
514 case Intrinsic::umax:
515 case Intrinsic::smin:
516 case Intrinsic::smax: {
517 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
518 MVT::v8i16, MVT::v2i32, MVT::v4i32,
519 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
520 MVT::nxv2i64};
522 // v2i64 types get converted to cmp+bif hence the cost of 2
523 if (LT.second == MVT::v2i64)
524 return LT.first * 2;
525 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
526 return LT.first;
527 break;
528 }
529 case Intrinsic::sadd_sat:
530 case Intrinsic::ssub_sat:
531 case Intrinsic::uadd_sat:
532 case Intrinsic::usub_sat: {
533 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
534 MVT::v8i16, MVT::v2i32, MVT::v4i32,
535 MVT::v2i64};
537 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
538 // need to extend the type, as it uses shr(qadd(shl, shl)).
539 unsigned Instrs =
540 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
541 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
542 return LT.first * Instrs;
543 break;
544 }
545 case Intrinsic::abs: {
546 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
547 MVT::v8i16, MVT::v2i32, MVT::v4i32,
548 MVT::v2i64};
550 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
551 return LT.first;
552 break;
553 }
554 case Intrinsic::bswap: {
555 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
556 MVT::v4i32, MVT::v2i64};
558 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
559 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
560 return LT.first;
561 break;
562 }
563 case Intrinsic::experimental_stepvector: {
564 InstructionCost Cost = 1; // Cost of the `index' instruction
566 // Legalisation of illegal vectors involves an `index' instruction plus
567 // (LT.first - 1) vector adds.
568 if (LT.first > 1) {
569 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
570 InstructionCost AddCost =
571 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
572 Cost += AddCost * (LT.first - 1);
573 }
574 return Cost;
575 }
576 case Intrinsic::vector_extract:
577 case Intrinsic::vector_insert: {
578 // If both the vector and subvector types are legal types and the index
579 // is 0, then this should be a no-op or simple operation; return a
580 // relatively low cost.
581
582 // If arguments aren't actually supplied, then we cannot determine the
583 // value of the index. We also want to skip predicate types.
584 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
586 break;
587
588 LLVMContext &C = RetTy->getContext();
589 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
590 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
591 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
592 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
593 // Skip this if either the vector or subvector types are unpacked
594 // SVE types; they may get lowered to stack stores and loads.
595 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
596 break;
597
599 getTLI()->getTypeConversion(C, SubVecVT);
601 getTLI()->getTypeConversion(C, VecVT);
602 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
603 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
604 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
605 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
606 return TTI::TCC_Free;
607 break;
608 }
609 case Intrinsic::bitreverse: {
610 static const CostTblEntry BitreverseTbl[] = {
611 {Intrinsic::bitreverse, MVT::i32, 1},
612 {Intrinsic::bitreverse, MVT::i64, 1},
613 {Intrinsic::bitreverse, MVT::v8i8, 1},
614 {Intrinsic::bitreverse, MVT::v16i8, 1},
615 {Intrinsic::bitreverse, MVT::v4i16, 2},
616 {Intrinsic::bitreverse, MVT::v8i16, 2},
617 {Intrinsic::bitreverse, MVT::v2i32, 2},
618 {Intrinsic::bitreverse, MVT::v4i32, 2},
619 {Intrinsic::bitreverse, MVT::v1i64, 2},
620 {Intrinsic::bitreverse, MVT::v2i64, 2},
621 };
622 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
623 const auto *Entry =
624 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
625 if (Entry) {
626 // Cost Model is using the legal type(i32) that i8 and i16 will be
627 // converted to +1 so that we match the actual lowering cost
628 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
629 TLI->getValueType(DL, RetTy, true) == MVT::i16)
630 return LegalisationCost.first * Entry->Cost + 1;
631
632 return LegalisationCost.first * Entry->Cost;
633 }
634 break;
635 }
636 case Intrinsic::ctpop: {
637 if (!ST->hasNEON()) {
638 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
639 return getTypeLegalizationCost(RetTy).first * 12;
640 }
641 static const CostTblEntry CtpopCostTbl[] = {
642 {ISD::CTPOP, MVT::v2i64, 4},
643 {ISD::CTPOP, MVT::v4i32, 3},
644 {ISD::CTPOP, MVT::v8i16, 2},
645 {ISD::CTPOP, MVT::v16i8, 1},
646 {ISD::CTPOP, MVT::i64, 4},
647 {ISD::CTPOP, MVT::v2i32, 3},
648 {ISD::CTPOP, MVT::v4i16, 2},
649 {ISD::CTPOP, MVT::v8i8, 1},
650 {ISD::CTPOP, MVT::i32, 5},
651 };
653 MVT MTy = LT.second;
654 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
655 // Extra cost of +1 when illegal vector types are legalized by promoting
656 // the integer type.
657 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
658 RetTy->getScalarSizeInBits()
659 ? 1
660 : 0;
661 return LT.first * Entry->Cost + ExtraCost;
662 }
663 break;
664 }
665 case Intrinsic::sadd_with_overflow:
666 case Intrinsic::uadd_with_overflow:
667 case Intrinsic::ssub_with_overflow:
668 case Intrinsic::usub_with_overflow:
669 case Intrinsic::smul_with_overflow:
670 case Intrinsic::umul_with_overflow: {
671 static const CostTblEntry WithOverflowCostTbl[] = {
672 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
673 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
674 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
675 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
676 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
677 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
678 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
679 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
680 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
681 {Intrinsic::usub_with_overflow, MVT::i8, 3},
682 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
683 {Intrinsic::usub_with_overflow, MVT::i16, 3},
684 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
685 {Intrinsic::usub_with_overflow, MVT::i32, 1},
686 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
687 {Intrinsic::usub_with_overflow, MVT::i64, 1},
688 {Intrinsic::smul_with_overflow, MVT::i8, 5},
689 {Intrinsic::umul_with_overflow, MVT::i8, 4},
690 {Intrinsic::smul_with_overflow, MVT::i16, 5},
691 {Intrinsic::umul_with_overflow, MVT::i16, 4},
692 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
693 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
694 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
695 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
696 };
697 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
698 if (MTy.isSimple())
699 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
700 MTy.getSimpleVT()))
701 return Entry->Cost;
702 break;
703 }
704 case Intrinsic::fptosi_sat:
705 case Intrinsic::fptoui_sat: {
706 if (ICA.getArgTypes().empty())
707 break;
708 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
709 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
710 EVT MTy = TLI->getValueType(DL, RetTy);
711 // Check for the legal types, which are where the size of the input and the
712 // output are the same, or we are using cvt f64->i32 or f32->i64.
713 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
714 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
715 LT.second == MVT::v2f64) &&
716 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
717 (LT.second == MVT::f64 && MTy == MVT::i32) ||
718 (LT.second == MVT::f32 && MTy == MVT::i64)))
719 return LT.first;
720 // Similarly for fp16 sizes
721 if (ST->hasFullFP16() &&
722 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
723 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
724 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
725 return LT.first;
726
727 // Otherwise we use a legal convert followed by a min+max
728 if ((LT.second.getScalarType() == MVT::f32 ||
729 LT.second.getScalarType() == MVT::f64 ||
730 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
731 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
732 Type *LegalTy =
733 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
734 if (LT.second.isVector())
735 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
737 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
738 LegalTy, {LegalTy, LegalTy});
740 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
741 LegalTy, {LegalTy, LegalTy});
743 return LT.first * Cost;
744 }
745 break;
746 }
747 case Intrinsic::fshl:
748 case Intrinsic::fshr: {
749 if (ICA.getArgs().empty())
750 break;
751
752 // TODO: Add handling for fshl where third argument is not a constant.
753 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
754 if (!OpInfoZ.isConstant())
755 break;
756
757 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
758 if (OpInfoZ.isUniform()) {
759 // FIXME: The costs could be lower if the codegen is better.
760 static const CostTblEntry FshlTbl[] = {
761 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
762 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
763 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
764 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
765 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
766 // to avoid having to duplicate the costs.
767 const auto *Entry =
768 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
769 if (Entry)
770 return LegalisationCost.first * Entry->Cost;
771 }
772
773 auto TyL = getTypeLegalizationCost(RetTy);
774 if (!RetTy->isIntegerTy())
775 break;
776
777 // Estimate cost manually, as types like i8 and i16 will get promoted to
778 // i32 and CostTableLookup will ignore the extra conversion cost.
779 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
780 RetTy->getScalarSizeInBits() < 64) ||
781 (RetTy->getScalarSizeInBits() % 64 != 0);
782 unsigned ExtraCost = HigherCost ? 1 : 0;
783 if (RetTy->getScalarSizeInBits() == 32 ||
784 RetTy->getScalarSizeInBits() == 64)
785 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
786 // extr instruction.
787 else if (HigherCost)
788 ExtraCost = 1;
789 else
790 break;
791 return TyL.first + ExtraCost;
792 }
793 default:
794 break;
795 }
797}
798
799/// The function will remove redundant reinterprets casting in the presence
800/// of the control flow
801static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
802 IntrinsicInst &II) {
804 auto RequiredType = II.getType();
805
806 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
807 assert(PN && "Expected Phi Node!");
808
809 // Don't create a new Phi unless we can remove the old one.
810 if (!PN->hasOneUse())
811 return std::nullopt;
812
813 for (Value *IncValPhi : PN->incoming_values()) {
814 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
815 if (!Reinterpret ||
816 Reinterpret->getIntrinsicID() !=
817 Intrinsic::aarch64_sve_convert_to_svbool ||
818 RequiredType != Reinterpret->getArgOperand(0)->getType())
819 return std::nullopt;
820 }
821
822 // Create the new Phi
823 IC.Builder.SetInsertPoint(PN);
824 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
825 Worklist.push_back(PN);
826
827 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
828 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
829 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
830 Worklist.push_back(Reinterpret);
831 }
832
833 // Cleanup Phi Node and reinterprets
834 return IC.replaceInstUsesWith(II, NPN);
835}
836
837// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
838// => (binop (pred) (from_svbool _) (from_svbool _))
839//
840// The above transformation eliminates a `to_svbool` in the predicate
841// operand of bitwise operation `binop` by narrowing the vector width of
842// the operation. For example, it would convert a `<vscale x 16 x i1>
843// and` into a `<vscale x 4 x i1> and`. This is profitable because
844// to_svbool must zero the new lanes during widening, whereas
845// from_svbool is free.
846static std::optional<Instruction *>
848 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
849 if (!BinOp)
850 return std::nullopt;
851
852 auto IntrinsicID = BinOp->getIntrinsicID();
853 switch (IntrinsicID) {
854 case Intrinsic::aarch64_sve_and_z:
855 case Intrinsic::aarch64_sve_bic_z:
856 case Intrinsic::aarch64_sve_eor_z:
857 case Intrinsic::aarch64_sve_nand_z:
858 case Intrinsic::aarch64_sve_nor_z:
859 case Intrinsic::aarch64_sve_orn_z:
860 case Intrinsic::aarch64_sve_orr_z:
861 break;
862 default:
863 return std::nullopt;
864 }
865
866 auto BinOpPred = BinOp->getOperand(0);
867 auto BinOpOp1 = BinOp->getOperand(1);
868 auto BinOpOp2 = BinOp->getOperand(2);
869
870 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
871 if (!PredIntr ||
872 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
873 return std::nullopt;
874
875 auto PredOp = PredIntr->getOperand(0);
876 auto PredOpTy = cast<VectorType>(PredOp->getType());
877 if (PredOpTy != II.getType())
878 return std::nullopt;
879
880 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
881 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
882 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
883 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
884 if (BinOpOp1 == BinOpOp2)
885 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
886 else
887 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
888 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
889
890 auto NarrowedBinOp =
891 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
892 return IC.replaceInstUsesWith(II, NarrowedBinOp);
893}
894
895static std::optional<Instruction *>
897 // If the reinterpret instruction operand is a PHI Node
898 if (isa<PHINode>(II.getArgOperand(0)))
899 return processPhiNode(IC, II);
900
901 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
902 return BinOpCombine;
903
904 // Ignore converts to/from svcount_t.
905 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
906 isa<TargetExtType>(II.getType()))
907 return std::nullopt;
908
909 SmallVector<Instruction *, 32> CandidatesForRemoval;
910 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
911
912 const auto *IVTy = cast<VectorType>(II.getType());
913
914 // Walk the chain of conversions.
915 while (Cursor) {
916 // If the type of the cursor has fewer lanes than the final result, zeroing
917 // must take place, which breaks the equivalence chain.
918 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
919 if (CursorVTy->getElementCount().getKnownMinValue() <
920 IVTy->getElementCount().getKnownMinValue())
921 break;
922
923 // If the cursor has the same type as I, it is a viable replacement.
924 if (Cursor->getType() == IVTy)
925 EarliestReplacement = Cursor;
926
927 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
928
929 // If this is not an SVE conversion intrinsic, this is the end of the chain.
930 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
931 Intrinsic::aarch64_sve_convert_to_svbool ||
932 IntrinsicCursor->getIntrinsicID() ==
933 Intrinsic::aarch64_sve_convert_from_svbool))
934 break;
935
936 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
937 Cursor = IntrinsicCursor->getOperand(0);
938 }
939
940 // If no viable replacement in the conversion chain was found, there is
941 // nothing to do.
942 if (!EarliestReplacement)
943 return std::nullopt;
944
945 return IC.replaceInstUsesWith(II, EarliestReplacement);
946}
947
948static bool isAllActivePredicate(Value *Pred) {
949 // Look through convert.from.svbool(convert.to.svbool(...) chain.
950 Value *UncastedPred;
951 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
952 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
953 m_Value(UncastedPred)))))
954 // If the predicate has the same or less lanes than the uncasted
955 // predicate then we know the casting has no effect.
956 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
957 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
958 Pred = UncastedPred;
959
960 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
961 m_ConstantInt<AArch64SVEPredPattern::all>()));
962}
963
964static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
965 IntrinsicInst &II) {
966 // svsel(ptrue, x, y) => x
967 auto *OpPredicate = II.getOperand(0);
968 if (isAllActivePredicate(OpPredicate))
969 return IC.replaceInstUsesWith(II, II.getOperand(1));
970
971 auto Select =
972 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
973 return IC.replaceInstUsesWith(II, Select);
974}
975
976static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
977 IntrinsicInst &II) {
978 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
979 if (!Pg)
980 return std::nullopt;
981
982 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
983 return std::nullopt;
984
985 const auto PTruePattern =
986 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
987 if (PTruePattern != AArch64SVEPredPattern::vl1)
988 return std::nullopt;
989
990 // The intrinsic is inserting into lane zero so use an insert instead.
991 auto *IdxTy = Type::getInt64Ty(II.getContext());
992 auto *Insert = InsertElementInst::Create(
993 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
994 Insert->insertBefore(&II);
995 Insert->takeName(&II);
996
997 return IC.replaceInstUsesWith(II, Insert);
998}
999
1000static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1001 IntrinsicInst &II) {
1002 // Replace DupX with a regular IR splat.
1003 auto *RetTy = cast<ScalableVectorType>(II.getType());
1004 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1005 II.getArgOperand(0));
1006 Splat->takeName(&II);
1007 return IC.replaceInstUsesWith(II, Splat);
1008}
1009
1010static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1011 IntrinsicInst &II) {
1012 LLVMContext &Ctx = II.getContext();
1013
1014 // Check that the predicate is all active
1015 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1016 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1017 return std::nullopt;
1018
1019 const auto PTruePattern =
1020 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1021 if (PTruePattern != AArch64SVEPredPattern::all)
1022 return std::nullopt;
1023
1024 // Check that we have a compare of zero..
1025 auto *SplatValue =
1026 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1027 if (!SplatValue || !SplatValue->isZero())
1028 return std::nullopt;
1029
1030 // ..against a dupq
1031 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1032 if (!DupQLane ||
1033 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1034 return std::nullopt;
1035
1036 // Where the dupq is a lane 0 replicate of a vector insert
1037 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1038 return std::nullopt;
1039
1040 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1041 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1042 return std::nullopt;
1043
1044 // Where the vector insert is a fixed constant vector insert into undef at
1045 // index zero
1046 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1047 return std::nullopt;
1048
1049 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1050 return std::nullopt;
1051
1052 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1053 if (!ConstVec)
1054 return std::nullopt;
1055
1056 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1057 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1058 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1059 return std::nullopt;
1060
1061 unsigned NumElts = VecTy->getNumElements();
1062 unsigned PredicateBits = 0;
1063
1064 // Expand intrinsic operands to a 16-bit byte level predicate
1065 for (unsigned I = 0; I < NumElts; ++I) {
1066 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1067 if (!Arg)
1068 return std::nullopt;
1069 if (!Arg->isZero())
1070 PredicateBits |= 1 << (I * (16 / NumElts));
1071 }
1072
1073 // If all bits are zero bail early with an empty predicate
1074 if (PredicateBits == 0) {
1075 auto *PFalse = Constant::getNullValue(II.getType());
1076 PFalse->takeName(&II);
1077 return IC.replaceInstUsesWith(II, PFalse);
1078 }
1079
1080 // Calculate largest predicate type used (where byte predicate is largest)
1081 unsigned Mask = 8;
1082 for (unsigned I = 0; I < 16; ++I)
1083 if ((PredicateBits & (1 << I)) != 0)
1084 Mask |= (I % 8);
1085
1086 unsigned PredSize = Mask & -Mask;
1087 auto *PredType = ScalableVectorType::get(
1088 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1089
1090 // Ensure all relevant bits are set
1091 for (unsigned I = 0; I < 16; I += PredSize)
1092 if ((PredicateBits & (1 << I)) == 0)
1093 return std::nullopt;
1094
1095 auto *PTruePat =
1096 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1097 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1098 {PredType}, {PTruePat});
1099 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1100 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1101 auto *ConvertFromSVBool =
1102 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1103 {II.getType()}, {ConvertToSVBool});
1104
1105 ConvertFromSVBool->takeName(&II);
1106 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1107}
1108
1109static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1110 IntrinsicInst &II) {
1111 Value *Pg = II.getArgOperand(0);
1112 Value *Vec = II.getArgOperand(1);
1113 auto IntrinsicID = II.getIntrinsicID();
1114 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1115
1116 // lastX(splat(X)) --> X
1117 if (auto *SplatVal = getSplatValue(Vec))
1118 return IC.replaceInstUsesWith(II, SplatVal);
1119
1120 // If x and/or y is a splat value then:
1121 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1122 Value *LHS, *RHS;
1123 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1124 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1125 auto *OldBinOp = cast<BinaryOperator>(Vec);
1126 auto OpC = OldBinOp->getOpcode();
1127 auto *NewLHS =
1128 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1129 auto *NewRHS =
1130 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1132 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1133 return IC.replaceInstUsesWith(II, NewBinOp);
1134 }
1135 }
1136
1137 auto *C = dyn_cast<Constant>(Pg);
1138 if (IsAfter && C && C->isNullValue()) {
1139 // The intrinsic is extracting lane 0 so use an extract instead.
1140 auto *IdxTy = Type::getInt64Ty(II.getContext());
1141 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1142 Extract->insertBefore(&II);
1143 Extract->takeName(&II);
1144 return IC.replaceInstUsesWith(II, Extract);
1145 }
1146
1147 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1148 if (!IntrPG)
1149 return std::nullopt;
1150
1151 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1152 return std::nullopt;
1153
1154 const auto PTruePattern =
1155 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1156
1157 // Can the intrinsic's predicate be converted to a known constant index?
1158 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1159 if (!MinNumElts)
1160 return std::nullopt;
1161
1162 unsigned Idx = MinNumElts - 1;
1163 // Increment the index if extracting the element after the last active
1164 // predicate element.
1165 if (IsAfter)
1166 ++Idx;
1167
1168 // Ignore extracts whose index is larger than the known minimum vector
1169 // length. NOTE: This is an artificial constraint where we prefer to
1170 // maintain what the user asked for until an alternative is proven faster.
1171 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1172 if (Idx >= PgVTy->getMinNumElements())
1173 return std::nullopt;
1174
1175 // The intrinsic is extracting a fixed lane so use an extract instead.
1176 auto *IdxTy = Type::getInt64Ty(II.getContext());
1177 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1178 Extract->insertBefore(&II);
1179 Extract->takeName(&II);
1180 return IC.replaceInstUsesWith(II, Extract);
1181}
1182
1183static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1184 IntrinsicInst &II) {
1185 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1186 // integer variant across a variety of micro-architectures. Replace scalar
1187 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1188 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1189 // depending on the micro-architecture, but has been observed as generally
1190 // being faster, particularly when the CLAST[AB] op is a loop-carried
1191 // dependency.
1192 Value *Pg = II.getArgOperand(0);
1193 Value *Fallback = II.getArgOperand(1);
1194 Value *Vec = II.getArgOperand(2);
1195 Type *Ty = II.getType();
1196
1197 if (!Ty->isIntegerTy())
1198 return std::nullopt;
1199
1200 Type *FPTy;
1201 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1202 default:
1203 return std::nullopt;
1204 case 16:
1205 FPTy = IC.Builder.getHalfTy();
1206 break;
1207 case 32:
1208 FPTy = IC.Builder.getFloatTy();
1209 break;
1210 case 64:
1211 FPTy = IC.Builder.getDoubleTy();
1212 break;
1213 }
1214
1215 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1216 auto *FPVTy = VectorType::get(
1217 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1218 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1219 auto *FPII = IC.Builder.CreateIntrinsic(
1220 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1221 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1222 return IC.replaceInstUsesWith(II, FPIItoInt);
1223}
1224
1225static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1226 IntrinsicInst &II) {
1227 LLVMContext &Ctx = II.getContext();
1228 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1229 // can work with RDFFR_PP for ptest elimination.
1230 auto *AllPat =
1231 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1232 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1233 {II.getType()}, {AllPat});
1234 auto *RDFFR =
1235 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1236 RDFFR->takeName(&II);
1237 return IC.replaceInstUsesWith(II, RDFFR);
1238}
1239
1240static std::optional<Instruction *>
1242 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1243
1244 if (Pattern == AArch64SVEPredPattern::all) {
1245 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1246 auto *VScale = IC.Builder.CreateVScale(StepVal);
1247 VScale->takeName(&II);
1248 return IC.replaceInstUsesWith(II, VScale);
1249 }
1250
1251 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1252
1253 return MinNumElts && NumElts >= MinNumElts
1254 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1255 II, ConstantInt::get(II.getType(), MinNumElts)))
1256 : std::nullopt;
1257}
1258
1259static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1260 IntrinsicInst &II) {
1261 Value *PgVal = II.getArgOperand(0);
1262 Value *OpVal = II.getArgOperand(1);
1263
1264 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1265 // Later optimizations prefer this form.
1266 if (PgVal == OpVal &&
1267 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1268 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1269 Value *Ops[] = {PgVal, OpVal};
1270 Type *Tys[] = {PgVal->getType()};
1271
1272 auto *PTest =
1273 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1274 PTest->takeName(&II);
1275
1276 return IC.replaceInstUsesWith(II, PTest);
1277 }
1278
1279 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1280 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1281
1282 if (!Pg || !Op)
1283 return std::nullopt;
1284
1285 Intrinsic::ID OpIID = Op->getIntrinsicID();
1286
1287 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1288 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1289 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1290 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1291 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1292
1293 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1294
1295 PTest->takeName(&II);
1296 return IC.replaceInstUsesWith(II, PTest);
1297 }
1298
1299 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1300 // Later optimizations may rewrite sequence to use the flag-setting variant
1301 // of instruction X to remove PTEST.
1302 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1303 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1304 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1305 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1306 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1307 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1308 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1309 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1310 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1311 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1312 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1313 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1314 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1315 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1316 Type *Tys[] = {Pg->getType()};
1317
1318 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1319 PTest->takeName(&II);
1320
1321 return IC.replaceInstUsesWith(II, PTest);
1322 }
1323
1324 return std::nullopt;
1325}
1326
1327template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1328static std::optional<Instruction *>
1330 bool MergeIntoAddendOp) {
1331 Value *P = II.getOperand(0);
1332 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1333 if (MergeIntoAddendOp) {
1334 AddendOp = II.getOperand(1);
1335 Mul = II.getOperand(2);
1336 } else {
1337 AddendOp = II.getOperand(2);
1338 Mul = II.getOperand(1);
1339 }
1340
1341 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1342 m_Value(MulOp1))))
1343 return std::nullopt;
1344
1345 if (!Mul->hasOneUse())
1346 return std::nullopt;
1347
1348 Instruction *FMFSource = nullptr;
1349 if (II.getType()->isFPOrFPVectorTy()) {
1350 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1351 // Stop the combine when the flags on the inputs differ in case dropping
1352 // flags would lead to us missing out on more beneficial optimizations.
1353 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1354 return std::nullopt;
1355 if (!FAddFlags.allowContract())
1356 return std::nullopt;
1357 FMFSource = &II;
1358 }
1359
1360 CallInst *Res;
1361 if (MergeIntoAddendOp)
1362 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1363 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1364 else
1365 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1366 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1367
1368 return IC.replaceInstUsesWith(II, Res);
1369}
1370
1371static std::optional<Instruction *>
1373 Value *Pred = II.getOperand(0);
1374 Value *PtrOp = II.getOperand(1);
1375 Type *VecTy = II.getType();
1376
1377 if (isAllActivePredicate(Pred)) {
1378 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1379 Load->copyMetadata(II);
1380 return IC.replaceInstUsesWith(II, Load);
1381 }
1382
1383 CallInst *MaskedLoad =
1384 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1385 Pred, ConstantAggregateZero::get(VecTy));
1386 MaskedLoad->copyMetadata(II);
1387 return IC.replaceInstUsesWith(II, MaskedLoad);
1388}
1389
1390static std::optional<Instruction *>
1392 Value *VecOp = II.getOperand(0);
1393 Value *Pred = II.getOperand(1);
1394 Value *PtrOp = II.getOperand(2);
1395
1396 if (isAllActivePredicate(Pred)) {
1397 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1398 Store->copyMetadata(II);
1399 return IC.eraseInstFromFunction(II);
1400 }
1401
1402 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1403 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1404 MaskedStore->copyMetadata(II);
1405 return IC.eraseInstFromFunction(II);
1406}
1407
1409 switch (Intrinsic) {
1410 case Intrinsic::aarch64_sve_fmul_u:
1411 return Instruction::BinaryOps::FMul;
1412 case Intrinsic::aarch64_sve_fadd_u:
1413 return Instruction::BinaryOps::FAdd;
1414 case Intrinsic::aarch64_sve_fsub_u:
1415 return Instruction::BinaryOps::FSub;
1416 default:
1417 return Instruction::BinaryOpsEnd;
1418 }
1419}
1420
1421static std::optional<Instruction *>
1423 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1424 if (II.isStrictFP())
1425 return std::nullopt;
1426
1427 auto *OpPredicate = II.getOperand(0);
1428 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1429 if (BinOpCode == Instruction::BinaryOpsEnd ||
1430 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1431 m_ConstantInt<AArch64SVEPredPattern::all>())))
1432 return std::nullopt;
1435 auto BinOp =
1436 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1437 return IC.replaceInstUsesWith(II, BinOp);
1438}
1439
1440// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1441// sve.add_u).
1442static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1443 Intrinsic::ID IID) {
1444 auto *OpPredicate = II.getOperand(0);
1445 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1446 m_ConstantInt<AArch64SVEPredPattern::all>())))
1447 return std::nullopt;
1448
1449 auto *Mod = II.getModule();
1450 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
1451 II.setCalledFunction(NewDecl);
1452
1453 return &II;
1454}
1455
1456// Simplify operations where predicate has all inactive lanes or try to replace
1457// with _u form when all lanes are active
1458static std::optional<Instruction *>
1460 Intrinsic::ID IID) {
1461 if (match(II.getOperand(0), m_ZeroInt())) {
1462 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1463 // inactive for sv[func]_m
1464 return IC.replaceInstUsesWith(II, II.getOperand(1));
1465 }
1466 return instCombineSVEAllActive(II, IID);
1467}
1468
1469static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1470 IntrinsicInst &II) {
1471 if (auto II_U =
1472 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1473 return II_U;
1474 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1475 Intrinsic::aarch64_sve_mla>(
1476 IC, II, true))
1477 return MLA;
1478 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1479 Intrinsic::aarch64_sve_mad>(
1480 IC, II, false))
1481 return MAD;
1482 return std::nullopt;
1483}
1484
1485static std::optional<Instruction *>
1487 if (auto II_U =
1488 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1489 return II_U;
1490 if (auto FMLA =
1491 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1492 Intrinsic::aarch64_sve_fmla>(IC, II,
1493 true))
1494 return FMLA;
1495 if (auto FMAD =
1496 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1497 Intrinsic::aarch64_sve_fmad>(IC, II,
1498 false))
1499 return FMAD;
1500 if (auto FMLA =
1501 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1502 Intrinsic::aarch64_sve_fmla>(IC, II,
1503 true))
1504 return FMLA;
1505 return std::nullopt;
1506}
1507
1508static std::optional<Instruction *>
1510 if (auto FMLA =
1511 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1512 Intrinsic::aarch64_sve_fmla>(IC, II,
1513 true))
1514 return FMLA;
1515 if (auto FMAD =
1516 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1517 Intrinsic::aarch64_sve_fmad>(IC, II,
1518 false))
1519 return FMAD;
1520 if (auto FMLA_U =
1521 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1522 Intrinsic::aarch64_sve_fmla_u>(
1523 IC, II, true))
1524 return FMLA_U;
1525 return instCombineSVEVectorBinOp(IC, II);
1526}
1527
1528static std::optional<Instruction *>
1530 if (auto II_U =
1531 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1532 return II_U;
1533 if (auto FMLS =
1534 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1535 Intrinsic::aarch64_sve_fmls>(IC, II,
1536 true))
1537 return FMLS;
1538 if (auto FMSB =
1539 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1540 Intrinsic::aarch64_sve_fnmsb>(
1541 IC, II, false))
1542 return FMSB;
1543 if (auto FMLS =
1544 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1545 Intrinsic::aarch64_sve_fmls>(IC, II,
1546 true))
1547 return FMLS;
1548 return std::nullopt;
1549}
1550
1551static std::optional<Instruction *>
1553 if (auto FMLS =
1554 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1555 Intrinsic::aarch64_sve_fmls>(IC, II,
1556 true))
1557 return FMLS;
1558 if (auto FMSB =
1559 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1560 Intrinsic::aarch64_sve_fnmsb>(
1561 IC, II, false))
1562 return FMSB;
1563 if (auto FMLS_U =
1564 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1565 Intrinsic::aarch64_sve_fmls_u>(
1566 IC, II, true))
1567 return FMLS_U;
1568 return instCombineSVEVectorBinOp(IC, II);
1569}
1570
1571static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1572 IntrinsicInst &II) {
1573 if (auto II_U =
1574 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1575 return II_U;
1576 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1577 Intrinsic::aarch64_sve_mls>(
1578 IC, II, true))
1579 return MLS;
1580 return std::nullopt;
1581}
1582
1583static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1584 IntrinsicInst &II,
1585 Intrinsic::ID IID) {
1586 auto *OpPredicate = II.getOperand(0);
1587 auto *OpMultiplicand = II.getOperand(1);
1588 auto *OpMultiplier = II.getOperand(2);
1589
1590 // Return true if a given instruction is a unit splat value, false otherwise.
1591 auto IsUnitSplat = [](auto *I) {
1592 auto *SplatValue = getSplatValue(I);
1593 if (!SplatValue)
1594 return false;
1595 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1596 };
1597
1598 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1599 // with a unit splat value, false otherwise.
1600 auto IsUnitDup = [](auto *I) {
1601 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1602 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1603 return false;
1604
1605 auto *SplatValue = IntrI->getOperand(2);
1606 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1607 };
1608
1609 if (IsUnitSplat(OpMultiplier)) {
1610 // [f]mul pg %n, (dupx 1) => %n
1611 OpMultiplicand->takeName(&II);
1612 return IC.replaceInstUsesWith(II, OpMultiplicand);
1613 } else if (IsUnitDup(OpMultiplier)) {
1614 // [f]mul pg %n, (dup pg 1) => %n
1615 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1616 auto *DupPg = DupInst->getOperand(1);
1617 // TODO: this is naive. The optimization is still valid if DupPg
1618 // 'encompasses' OpPredicate, not only if they're the same predicate.
1619 if (OpPredicate == DupPg) {
1620 OpMultiplicand->takeName(&II);
1621 return IC.replaceInstUsesWith(II, OpMultiplicand);
1622 }
1623 }
1624
1625 return instCombineSVEVectorBinOp(IC, II);
1626}
1627
1628static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1629 IntrinsicInst &II) {
1630 Value *UnpackArg = II.getArgOperand(0);
1631 auto *RetTy = cast<ScalableVectorType>(II.getType());
1632 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1633 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1634
1635 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1636 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1637 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1638 ScalarArg =
1639 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1640 Value *NewVal =
1641 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1642 NewVal->takeName(&II);
1643 return IC.replaceInstUsesWith(II, NewVal);
1644 }
1645
1646 return std::nullopt;
1647}
1648static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1649 IntrinsicInst &II) {
1650 auto *OpVal = II.getOperand(0);
1651 auto *OpIndices = II.getOperand(1);
1652 VectorType *VTy = cast<VectorType>(II.getType());
1653
1654 // Check whether OpIndices is a constant splat value < minimal element count
1655 // of result.
1656 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1657 if (!SplatValue ||
1658 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1659 return std::nullopt;
1660
1661 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1662 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1663 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1664 auto *VectorSplat =
1665 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1666
1667 VectorSplat->takeName(&II);
1668 return IC.replaceInstUsesWith(II, VectorSplat);
1669}
1670
1671static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1672 IntrinsicInst &II) {
1673 Value *A, *B;
1674 Type *RetTy = II.getType();
1675 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1676 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1677
1678 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1679 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1680 if ((match(II.getArgOperand(0),
1681 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1682 match(II.getArgOperand(1),
1683 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1684 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1685 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1686 auto *TyA = cast<ScalableVectorType>(A->getType());
1687 if (TyA == B->getType() &&
1689 auto *SubVec = IC.Builder.CreateInsertVector(
1691 auto *ConcatVec = IC.Builder.CreateInsertVector(
1692 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1693 ConcatVec->takeName(&II);
1694 return IC.replaceInstUsesWith(II, ConcatVec);
1695 }
1696 }
1697
1698 return std::nullopt;
1699}
1700
1701static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1702 IntrinsicInst &II) {
1703 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1704 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1705 Value *A, *B;
1706 if (match(II.getArgOperand(0),
1707 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1708 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1709 m_Specific(A), m_Specific(B))))
1710 return IC.replaceInstUsesWith(
1711 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1712
1713 return std::nullopt;
1714}
1715
1716static std::optional<Instruction *>
1718 Value *Mask = II.getOperand(0);
1719 Value *BasePtr = II.getOperand(1);
1720 Value *Index = II.getOperand(2);
1721 Type *Ty = II.getType();
1722 Value *PassThru = ConstantAggregateZero::get(Ty);
1723
1724 // Contiguous gather => masked load.
1725 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1726 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1727 Value *IndexBase;
1728 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1729 m_Value(IndexBase), m_SpecificInt(1)))) {
1730 Align Alignment =
1731 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1732
1733 Type *VecPtrTy = PointerType::getUnqual(Ty);
1734 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1735 BasePtr, IndexBase);
1736 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1737 CallInst *MaskedLoad =
1738 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1739 MaskedLoad->takeName(&II);
1740 return IC.replaceInstUsesWith(II, MaskedLoad);
1741 }
1742
1743 return std::nullopt;
1744}
1745
1746static std::optional<Instruction *>
1748 Value *Val = II.getOperand(0);
1749 Value *Mask = II.getOperand(1);
1750 Value *BasePtr = II.getOperand(2);
1751 Value *Index = II.getOperand(3);
1752 Type *Ty = Val->getType();
1753
1754 // Contiguous scatter => masked store.
1755 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1756 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1757 Value *IndexBase;
1758 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1759 m_Value(IndexBase), m_SpecificInt(1)))) {
1760 Align Alignment =
1761 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1762
1763 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1764 BasePtr, IndexBase);
1765 Type *VecPtrTy = PointerType::getUnqual(Ty);
1766 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1767
1768 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1769
1770 return IC.eraseInstFromFunction(II);
1771 }
1772
1773 return std::nullopt;
1774}
1775
1776static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1777 IntrinsicInst &II) {
1779 Value *Pred = II.getOperand(0);
1780 Value *Vec = II.getOperand(1);
1781 Value *DivVec = II.getOperand(2);
1782
1783 Value *SplatValue = getSplatValue(DivVec);
1784 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1785 if (!SplatConstantInt)
1786 return std::nullopt;
1787 APInt Divisor = SplatConstantInt->getValue();
1788
1789 if (Divisor.isPowerOf2()) {
1790 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1791 auto ASRD = IC.Builder.CreateIntrinsic(
1792 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1793 return IC.replaceInstUsesWith(II, ASRD);
1794 }
1795 if (Divisor.isNegatedPowerOf2()) {
1796 Divisor.negate();
1797 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1798 auto ASRD = IC.Builder.CreateIntrinsic(
1799 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1800 auto NEG = IC.Builder.CreateIntrinsic(
1801 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1802 return IC.replaceInstUsesWith(II, NEG);
1803 }
1804
1805 return std::nullopt;
1806}
1807
1808bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1809 size_t VecSize = Vec.size();
1810 if (VecSize == 1)
1811 return true;
1812 if (!isPowerOf2_64(VecSize))
1813 return false;
1814 size_t HalfVecSize = VecSize / 2;
1815
1816 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1817 RHS != Vec.end(); LHS++, RHS++) {
1818 if (*LHS != nullptr && *RHS != nullptr) {
1819 if (*LHS == *RHS)
1820 continue;
1821 else
1822 return false;
1823 }
1824 if (!AllowPoison)
1825 return false;
1826 if (*LHS == nullptr && *RHS != nullptr)
1827 *LHS = *RHS;
1828 }
1829
1830 Vec.resize(HalfVecSize);
1831 SimplifyValuePattern(Vec, AllowPoison);
1832 return true;
1833}
1834
// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)) where C is A concatenated with B
static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
                                                           IntrinsicInst &II) {
  // Expect the dupq operand to be a fixed-width insertelement chain placed
  // into a scalable container via vector_insert; bail out otherwise.
  Value *CurrentInsertElt = nullptr, *Default = nullptr;
  if (!match(II.getOperand(0),
             m_Intrinsic<Intrinsic::vector_insert>(
                 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
      !isa<FixedVectorType>(CurrentInsertElt->getType()))
    return std::nullopt;
  auto IIScalableTy = cast<ScalableVectorType>(II.getType());

  // Insert the scalars into a container ordered by InsertElement index
  // (lanes never written remain nullptr, i.e. poison placeholders).
  SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
  while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
    // Walk up the chain towards its base vector.
    CurrentInsertElt = InsertElt->getOperand(0);
  }

  // Poison lanes may only be merged when both the chain's base vector and
  // the vector_insert default are themselves poison.
  bool AllowPoison =
      isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
  if (!SimplifyValuePattern(Elts, AllowPoison))
    return std::nullopt;

  // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
  Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
  for (size_t I = 0; I < Elts.size(); I++) {
    if (Elts[I] == nullptr)
      continue;
    InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
                                                    IC.Builder.getInt64(I));
  }
  if (InsertEltChain == nullptr)
    return std::nullopt;

  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
  // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
  // be bitcast to a type wide enough to fit the sequence, be splatted, and then
  // be narrowed back to the original type.
  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
  // Equivalent to MinNumElements / Elts.size(): the element count of the
  // widened scalable type holding one pattern per lane.
  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                                 IIScalableTy->getMinNumElements() /
                                 PatternWidth;

  IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
  auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
  auto *WideShuffleMaskTy =
      ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);

  // Place the rebuilt fixed vector at index 0 of a poison scalable vector,
  // view it as wide integer lanes, splat lane 0 with a zero shuffle mask,
  // then view the result back as the original scalable type.
  auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
  auto InsertSubvector = IC.Builder.CreateInsertVector(
      II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
  auto WideBitcast =
      IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
  auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
  auto WideShuffle = IC.Builder.CreateShuffleVector(
      WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
  auto NarrowBitcast =
      IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());

  return IC.replaceInstUsesWith(II, NarrowBitcast);
}
1898
1899static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1900 IntrinsicInst &II) {
1901 Value *A = II.getArgOperand(0);
1902 Value *B = II.getArgOperand(1);
1903 if (A == B)
1904 return IC.replaceInstUsesWith(II, A);
1905
1906 return std::nullopt;
1907}
1908
1909static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1910 IntrinsicInst &II) {
1911 Value *Pred = II.getOperand(0);
1912 Value *Vec = II.getOperand(1);
1913 Value *Shift = II.getOperand(2);
1914
1915 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1916 Value *AbsPred, *MergedValue;
1917 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1918 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1919 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1920 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1921
1922 return std::nullopt;
1923
1924 // Transform is valid if any of the following are true:
1925 // * The ABS merge value is an undef or non-negative
1926 // * The ABS predicate is all active
1927 // * The ABS predicate and the SRSHL predicates are the same
1928 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1929 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1930 return std::nullopt;
1931
1932 // Only valid when the shift amount is non-negative, otherwise the rounding
1933 // behaviour of SRSHL cannot be ignored.
1934 if (!match(Shift, m_NonNegative()))
1935 return std::nullopt;
1936
1937 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
1938 {II.getType()}, {Pred, Vec, Shift});
1939
1940 return IC.replaceInstUsesWith(II, LSL);
1941}
1942
1943std::optional<Instruction *>
1945 IntrinsicInst &II) const {
1946 Intrinsic::ID IID = II.getIntrinsicID();
1947 switch (IID) {
1948 default:
1949 break;
1950 case Intrinsic::aarch64_neon_fmaxnm:
1951 case Intrinsic::aarch64_neon_fminnm:
1952 return instCombineMaxMinNM(IC, II);
1953 case Intrinsic::aarch64_sve_convert_from_svbool:
1954 return instCombineConvertFromSVBool(IC, II);
1955 case Intrinsic::aarch64_sve_dup:
1956 return instCombineSVEDup(IC, II);
1957 case Intrinsic::aarch64_sve_dup_x:
1958 return instCombineSVEDupX(IC, II);
1959 case Intrinsic::aarch64_sve_cmpne:
1960 case Intrinsic::aarch64_sve_cmpne_wide:
1961 return instCombineSVECmpNE(IC, II);
1962 case Intrinsic::aarch64_sve_rdffr:
1963 return instCombineRDFFR(IC, II);
1964 case Intrinsic::aarch64_sve_lasta:
1965 case Intrinsic::aarch64_sve_lastb:
1966 return instCombineSVELast(IC, II);
1967 case Intrinsic::aarch64_sve_clasta_n:
1968 case Intrinsic::aarch64_sve_clastb_n:
1969 return instCombineSVECondLast(IC, II);
1970 case Intrinsic::aarch64_sve_cntd:
1971 return instCombineSVECntElts(IC, II, 2);
1972 case Intrinsic::aarch64_sve_cntw:
1973 return instCombineSVECntElts(IC, II, 4);
1974 case Intrinsic::aarch64_sve_cnth:
1975 return instCombineSVECntElts(IC, II, 8);
1976 case Intrinsic::aarch64_sve_cntb:
1977 return instCombineSVECntElts(IC, II, 16);
1978 case Intrinsic::aarch64_sve_ptest_any:
1979 case Intrinsic::aarch64_sve_ptest_first:
1980 case Intrinsic::aarch64_sve_ptest_last:
1981 return instCombineSVEPTest(IC, II);
1982 case Intrinsic::aarch64_sve_fabd:
1983 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
1984 case Intrinsic::aarch64_sve_fadd:
1985 return instCombineSVEVectorFAdd(IC, II);
1986 case Intrinsic::aarch64_sve_fadd_u:
1987 return instCombineSVEVectorFAddU(IC, II);
1988 case Intrinsic::aarch64_sve_fdiv:
1989 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
1990 case Intrinsic::aarch64_sve_fmax:
1991 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
1992 case Intrinsic::aarch64_sve_fmaxnm:
1993 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
1994 case Intrinsic::aarch64_sve_fmin:
1995 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
1996 case Intrinsic::aarch64_sve_fminnm:
1997 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
1998 case Intrinsic::aarch64_sve_fmla:
1999 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2000 case Intrinsic::aarch64_sve_fmls:
2001 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2002 case Intrinsic::aarch64_sve_fmul:
2003 if (auto II_U =
2004 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2005 return II_U;
2006 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2007 case Intrinsic::aarch64_sve_fmul_u:
2008 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2009 case Intrinsic::aarch64_sve_fmulx:
2010 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2011 case Intrinsic::aarch64_sve_fnmla:
2012 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2013 case Intrinsic::aarch64_sve_fnmls:
2014 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2015 case Intrinsic::aarch64_sve_fsub:
2016 return instCombineSVEVectorFSub(IC, II);
2017 case Intrinsic::aarch64_sve_fsub_u:
2018 return instCombineSVEVectorFSubU(IC, II);
2019 case Intrinsic::aarch64_sve_add:
2020 return instCombineSVEVectorAdd(IC, II);
2021 case Intrinsic::aarch64_sve_add_u:
2022 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2023 Intrinsic::aarch64_sve_mla_u>(
2024 IC, II, true);
2025 case Intrinsic::aarch64_sve_mla:
2026 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2027 case Intrinsic::aarch64_sve_mls:
2028 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2029 case Intrinsic::aarch64_sve_mul:
2030 if (auto II_U =
2031 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2032 return II_U;
2033 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2034 case Intrinsic::aarch64_sve_mul_u:
2035 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2036 case Intrinsic::aarch64_sve_sabd:
2037 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2038 case Intrinsic::aarch64_sve_smax:
2039 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2040 case Intrinsic::aarch64_sve_smin:
2041 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2042 case Intrinsic::aarch64_sve_smulh:
2043 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2044 case Intrinsic::aarch64_sve_sub:
2045 return instCombineSVEVectorSub(IC, II);
2046 case Intrinsic::aarch64_sve_sub_u:
2047 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2048 Intrinsic::aarch64_sve_mls_u>(
2049 IC, II, true);
2050 case Intrinsic::aarch64_sve_uabd:
2051 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2052 case Intrinsic::aarch64_sve_umax:
2053 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2054 case Intrinsic::aarch64_sve_umin:
2055 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2056 case Intrinsic::aarch64_sve_umulh:
2057 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2058 case Intrinsic::aarch64_sve_asr:
2059 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2060 case Intrinsic::aarch64_sve_lsl:
2061 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2062 case Intrinsic::aarch64_sve_lsr:
2063 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2064 case Intrinsic::aarch64_sve_and:
2065 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2066 case Intrinsic::aarch64_sve_bic:
2067 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2068 case Intrinsic::aarch64_sve_eor:
2069 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2070 case Intrinsic::aarch64_sve_orr:
2071 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2072 case Intrinsic::aarch64_sve_sqsub:
2073 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2074 case Intrinsic::aarch64_sve_uqsub:
2075 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2076 case Intrinsic::aarch64_sve_tbl:
2077 return instCombineSVETBL(IC, II);
2078 case Intrinsic::aarch64_sve_uunpkhi:
2079 case Intrinsic::aarch64_sve_uunpklo:
2080 case Intrinsic::aarch64_sve_sunpkhi:
2081 case Intrinsic::aarch64_sve_sunpklo:
2082 return instCombineSVEUnpack(IC, II);
2083 case Intrinsic::aarch64_sve_uzp1:
2084 return instCombineSVEUzp1(IC, II);
2085 case Intrinsic::aarch64_sve_zip1:
2086 case Intrinsic::aarch64_sve_zip2:
2087 return instCombineSVEZip(IC, II);
2088 case Intrinsic::aarch64_sve_ld1_gather_index:
2089 return instCombineLD1GatherIndex(IC, II);
2090 case Intrinsic::aarch64_sve_st1_scatter_index:
2091 return instCombineST1ScatterIndex(IC, II);
2092 case Intrinsic::aarch64_sve_ld1:
2093 return instCombineSVELD1(IC, II, DL);
2094 case Intrinsic::aarch64_sve_st1:
2095 return instCombineSVEST1(IC, II, DL);
2096 case Intrinsic::aarch64_sve_sdiv:
2097 return instCombineSVESDIV(IC, II);
2098 case Intrinsic::aarch64_sve_sel:
2099 return instCombineSVESel(IC, II);
2100 case Intrinsic::aarch64_sve_srshl:
2101 return instCombineSVESrshl(IC, II);
2102 case Intrinsic::aarch64_sve_dupq_lane:
2103 return instCombineSVEDupqLane(IC, II);
2104 }
2105
2106 return std::nullopt;
2107}
2108
2110 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2111 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2112 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2113 SimplifyAndSetOp) const {
2114 switch (II.getIntrinsicID()) {
2115 default:
2116 break;
2117 case Intrinsic::aarch64_neon_fcvtxn:
2118 case Intrinsic::aarch64_neon_rshrn:
2119 case Intrinsic::aarch64_neon_sqrshrn:
2120 case Intrinsic::aarch64_neon_sqrshrun:
2121 case Intrinsic::aarch64_neon_sqshrn:
2122 case Intrinsic::aarch64_neon_sqshrun:
2123 case Intrinsic::aarch64_neon_sqxtn:
2124 case Intrinsic::aarch64_neon_sqxtun:
2125 case Intrinsic::aarch64_neon_uqrshrn:
2126 case Intrinsic::aarch64_neon_uqshrn:
2127 case Intrinsic::aarch64_neon_uqxtn:
2128 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2129 break;
2130 }
2131
2132 return std::nullopt;
2133}
2134
2137 switch (K) {
2139 return TypeSize::getFixed(64);
2142 return TypeSize::getFixed(0);
2143
2144 if (ST->hasSVE())
2145 return TypeSize::getFixed(
2146 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2147
2148 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
2151 return TypeSize::getScalable(0);
2152
2153 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
2154 }
2155 llvm_unreachable("Unsupported register kind");
2156}
2157
2158bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2160 Type *SrcOverrideTy) {
2161 // A helper that returns a vector type from the given type. The number of
2162 // elements in type Ty determines the vector width.
2163 auto toVectorTy = [&](Type *ArgTy) {
2164 return VectorType::get(ArgTy->getScalarType(),
2165 cast<VectorType>(DstTy)->getElementCount());
2166 };
2167
2168 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2169 // i32, i64]. SVE doesn't generally have the same set of instructions to
2170 // perform an extend with the add/sub/mul. There are SMULLB style
2171 // instructions, but they operate on top/bottom, requiring some sort of lane
2172 // interleaving to be used with zext/sext.
2173 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2174 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2175 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2176 return false;
2177
2178 // Determine if the operation has a widening variant. We consider both the
2179 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2180 // instructions.
2181 //
2182 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2183 // verify that their extending operands are eliminated during code
2184 // generation.
2185 Type *SrcTy = SrcOverrideTy;
2186 switch (Opcode) {
2187 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2188 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2189 // The second operand needs to be an extend
2190 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2191 if (!SrcTy)
2192 SrcTy =
2193 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2194 } else
2195 return false;
2196 break;
2197 case Instruction::Mul: { // SMULL(2), UMULL(2)
2198 // Both operands need to be extends of the same type.
2199 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2200 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2201 if (!SrcTy)
2202 SrcTy =
2203 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2204 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2205 // If one of the operands is a Zext and the other has enough zero bits to
2206 // be treated as unsigned, we can still general a umull, meaning the zext
2207 // is free.
2208 KnownBits Known =
2209 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2210 if (Args[0]->getType()->getScalarSizeInBits() -
2211 Known.Zero.countLeadingOnes() >
2212 DstTy->getScalarSizeInBits() / 2)
2213 return false;
2214 if (!SrcTy)
2215 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2216 DstTy->getScalarSizeInBits() / 2));
2217 } else
2218 return false;
2219 break;
2220 }
2221 default:
2222 return false;
2223 }
2224
2225 // Legalize the destination type and ensure it can be used in a widening
2226 // operation.
2227 auto DstTyL = getTypeLegalizationCost(DstTy);
2228 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2229 return false;
2230
2231 // Legalize the source type and ensure it can be used in a widening
2232 // operation.
2233 assert(SrcTy && "Expected some SrcTy");
2234 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2235 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2236 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2237 return false;
2238
2239 // Get the total number of vector elements in the legalized types.
2240 InstructionCost NumDstEls =
2241 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2242 InstructionCost NumSrcEls =
2243 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2244
2245 // Return true if the legalized types have the same number of vector elements
2246 // and the destination element type size is twice that of the source type.
2247 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2248}
2249
2250// s/urhadd instructions implement the following pattern, making the
2251// extends free:
2252// %x = add ((zext i8 -> i16), 1)
2253// %y = (zext i8 -> i16)
2254// trunc i16 (lshr (add %x, %y), 1) -> i8
2255//
2257 Type *Src) {
2258 // The source should be a legal vector type.
2259 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2260 (Src->isScalableTy() && !ST->hasSVE2()))
2261 return false;
2262
2263 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2264 return false;
2265
2266 // Look for trunc/shl/add before trying to match the pattern.
2267 const Instruction *Add = ExtUser;
2268 auto *AddUser =
2269 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2270 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2271 Add = AddUser;
2272
2273 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2274 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2275 return false;
2276
2277 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2278 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2279 Src->getScalarSizeInBits() !=
2280 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2281 return false;
2282
2283 // Try to match the whole pattern. Ext could be either the first or second
2284 // m_ZExtOrSExt matched.
2285 Instruction *Ex1, *Ex2;
2286 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2287 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2288 return false;
2289
2290 // Ensure both extends are of the same type
2291 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2292 Ex1->getOpcode() == Ex2->getOpcode())
2293 return true;
2294
2295 return false;
2296}
2297
2299 Type *Src,
2302 const Instruction *I) {
2303 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2304 assert(ISD && "Invalid opcode");
2305 // If the cast is observable, and it is used by a widening instruction (e.g.,
2306 // uaddl, saddw, etc.), it may be free.
2307 if (I && I->hasOneUser()) {
2308 auto *SingleUser = cast<Instruction>(*I->user_begin());
2309 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2310 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2311 // For adds only count the second operand as free if both operands are
2312 // extends but not the same operation. (i.e both operands are not free in
2313 // add(sext, zext)).
2314 if (SingleUser->getOpcode() == Instruction::Add) {
2315 if (I == SingleUser->getOperand(1) ||
2316 (isa<CastInst>(SingleUser->getOperand(1)) &&
2317 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2318 return 0;
2319 } else // Others are free so long as isWideningInstruction returned true.
2320 return 0;
2321 }
2322
2323 // The cast will be free for the s/urhadd instructions
2324 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2325 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2326 return 0;
2327 }
2328
2329 // TODO: Allow non-throughput costs that aren't binary.
2330 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2332 return Cost == 0 ? 0 : 1;
2333 return Cost;
2334 };
2335
2336 EVT SrcTy = TLI->getValueType(DL, Src);
2337 EVT DstTy = TLI->getValueType(DL, Dst);
2338
2339 if (!SrcTy.isSimple() || !DstTy.isSimple())
2340 return AdjustCost(
2341 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2342
2343 static const TypeConversionCostTblEntry
2344 ConversionTbl[] = {
2345 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2346 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2347 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2348 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2349 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2350 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2351 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2352 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2353 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2354 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2355 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2356 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2357 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2358 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2359 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2360 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2361 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2362 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2363 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2364 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2365
2366 // Truncations on nxvmiN
2367 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2368 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2369 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2370 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2371 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2372 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2373 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2374 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2375 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2376 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2377 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2378 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2379 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2380 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2381 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2382 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2383
2384 // The number of shll instructions for the extension.
2385 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2386 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2387 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2388 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2389 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2390 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2391 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2392 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2393 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2394 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2395 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2396 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2397 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2398 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2399 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2400 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2401
2402 // LowerVectorINT_TO_FP:
2403 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2404 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2405 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2406 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2407 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2408 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2409
2410 // Complex: to v2f32
2411 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2412 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2413 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2414 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2415 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2416 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2417
2418 // Complex: to v4f32
2419 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2420 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2421 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2422 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2423
2424 // Complex: to v8f32
2425 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2426 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2427 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2428 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2429
2430 // Complex: to v16f32
2431 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2432 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2433
2434 // Complex: to v2f64
2435 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2436 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2437 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2438 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2439 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2440 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2441
2442 // Complex: to v4f64
2443 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2444 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2445
2446 // LowerVectorFP_TO_INT
2447 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2448 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2449 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2450 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2451 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2452 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2453
2454 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2455 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2456 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2457 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2458 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2459 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2460 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2461
2462 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2463 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2464 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2465 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2466 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2467
2468 // Complex, from nxv2f32.
2469 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2470 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2471 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2472 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2473 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2474 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2475 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2476 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2477
2478 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2479 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2480 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2481 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2482 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2483 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2484 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2485
2486 // Complex, from nxv2f64.
2487 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2488 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2489 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2490 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2491 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2492 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2493 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2494 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2495
2496 // Complex, from nxv4f32.
2497 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2498 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2499 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2500 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2501 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2502 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2503 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2504 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2505
2506 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2507 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2508 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2509 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2510 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2511
2512 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2513 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2514 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2515 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2516 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2517 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2518 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2519
2520 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2521 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2522 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2523 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2524 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2525
2526 // Complex, from nxv8f16.
2527 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2528 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2529 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2530 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2531 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2532 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2533 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2534 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2535
2536 // Complex, from nxv4f16.
2537 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2538 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2539 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2540 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2541 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2542 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2543 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2544 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2545
2546 // Complex, from nxv2f16.
2547 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2548 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2549 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2550 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2551 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2552 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2553 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2554 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2555
2556 // Truncate from nxvmf32 to nxvmf16.
2557 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2558 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2559 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2560
2561 // Truncate from nxvmf64 to nxvmf16.
2562 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2563 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2564 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2565
2566 // Truncate from nxvmf64 to nxvmf32.
2567 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2568 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2569 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2570
2571 // Extend from nxvmf16 to nxvmf32.
2572 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2573 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2574 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2575
2576 // Extend from nxvmf16 to nxvmf64.
2577 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2578 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2579 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2580
2581 // Extend from nxvmf32 to nxvmf64.
2582 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2583 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2584 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2585
2586 // Bitcasts from float to integer
2587 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2588 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2589 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2590
2591 // Bitcasts from integer to float
2592 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2593 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2594 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2595
2596 // Add cost for extending to illegal -too wide- scalable vectors.
2597 // zero/sign extend are implemented by multiple unpack operations,
2598 // where each operation has a cost of 1.
2599 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2600 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2601 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2602 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2603 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2604 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2605
2606 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2607 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2608 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2609 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2610 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2611 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2612 };
2613
2614 // We have to estimate a cost of fixed length operation upon
2615 // SVE registers(operations) with the number of registers required
2616 // for a fixed type to be represented upon SVE registers.
2617 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
2618 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2619 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2620 ST->useSVEForFixedLengthVectors(WiderTy)) {
2621 std::pair<InstructionCost, MVT> LT =
2622 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2623 unsigned NumElements = AArch64::SVEBitsPerBlock /
2624 LT.second.getVectorElementType().getSizeInBits();
2625 return AdjustCost(
2626 LT.first *
2628 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2629 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2630 CostKind, I));
2631 }
2632
2633 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2634 DstTy.getSimpleVT(),
2635 SrcTy.getSimpleVT()))
2636 return AdjustCost(Entry->Cost);
2637
2638 static const TypeConversionCostTblEntry FP16Tbl[] = {
2639 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2640 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2641 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2642 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2643 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2644 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2645 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2646 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2647 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2648 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2649 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2650 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2651 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2652 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2653 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2654 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2655 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2656 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2657 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2658 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2659 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2660 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2661 };
2662
2663 if (ST->hasFullFP16())
2664 if (const auto *Entry = ConvertCostTableLookup(
2665 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2666 return AdjustCost(Entry->Cost);
2667
2668 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2669 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2670 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2672 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2674 // The standard behaviour in the backend for these cases is to split the
2675 // extend up into two parts:
2676 // 1. Perform an extending load or masked load up to the legal type.
2677 // 2. Extend the loaded data to the final type.
2678 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
2679 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2681 Opcode, LegalTy, Src, CCH, CostKind, I);
2683 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
2684 return Part1 + Part2;
2685 }
2686
2687 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2688 // but we also want to include the TTI::CastContextHint::Masked case too.
2689 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2690 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2691 TLI->isTypeLegal(DstTy))
2693
2694 return AdjustCost(
2695 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2696}
2697
// Estimates the cost of a sign/zero-extend of a value extracted from a
// vector lane (sext/zext(extractelement)), which AArch64 can often fold into
// a single smov/umov lane move.
// NOTE(review): this scraped listing is missing doxygen lines 2698 (the
// opening of the signature) and 2716 (presumably the CostKind setup used by
// the getVectorInstrCost call below) — confirm against upstream before
// editing; the remaining code is kept verbatim.
2699 Type *Dst,
2700 VectorType *VecTy,
2701 unsigned Index) {
2702
2703 // Make sure we were given a valid extend opcode.
2704 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2705 "Invalid opcode");
2706
2707 // We are extending an element we extract from a vector, so the source type
2708 // of the extend is the element type of the vector.
2709 auto *Src = VecTy->getElementType();
2710
2711 // Sign- and zero-extends are for integer types only.
2712 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2713
2714 // Get the cost for the extract. We compute the cost (if any) for the extend
2715 // below.
2717 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2718 CostKind, Index, nullptr, nullptr);
2719
2720 // Legalize the types.
2721 auto VecLT = getTypeLegalizationCost(VecTy);
2722 auto DstVT = TLI->getValueType(DL, Dst);
2723 auto SrcVT = TLI->getValueType(DL, Src);
2724
2725 // If the resulting type is still a vector and the destination type is legal,
2726 // we may get the extension for free. If not, get the default cost for the
2727 // extend.
2728 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2729 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2730 CostKind);
2731
2732 // The destination type should be larger than the element type. If not, get
2733 // the default cost for the extend.
2734 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2735 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2736 CostKind);
2737
2738 switch (Opcode) {
2739 default:
2740 llvm_unreachable("Opcode should be either SExt or ZExt");
2741
2742 // For sign-extends, we only need a smov, which performs the extension
2743 // automatically.
2744 case Instruction::SExt:
2745 return Cost;
2746
2747 // For zero-extends, the extend is performed automatically by a umov unless
2748 // the destination type is i64 and the element type is i8 or i16.
2749 case Instruction::ZExt:
2750 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2751 return Cost;
2752 }
2753
2754 // If we are unable to perform the extend for free, get the default cost.
2755 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2756 CostKind);
2757}
2758
// Control-flow instruction cost: for size-style cost kinds PHIs are free and
// other control flow costs 1; for reciprocal throughput, branches are assumed
// predicted and therefore free.
// NOTE(review): doxygen lines 2759-2760 (the signature opening) and 2762
// (the guard before the first return, presumably a CostKind check) are
// missing from this scraped listing — confirm against upstream.
2761 const Instruction *I) {
2763 return Opcode == Instruction::PHI ? 0 : 1;
2764 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2765 // Branches are assumed to be predicted.
2766 return 0;
2767}
2768
2769InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2770 Type *Val,
2771 unsigned Index,
2772 bool HasRealUse) {
2773 assert(Val->isVectorTy() && "This must be a vector type");
2774
2775 if (Index != -1U) {
2776 // Legalize the type.
2777 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2778
2779 // This type is legalized to a scalar type.
2780 if (!LT.second.isVector())
2781 return 0;
2782
2783 // The type may be split. For fixed-width vectors we can normalize the
2784 // index to the new type.
2785 if (LT.second.isFixedLengthVector()) {
2786 unsigned Width = LT.second.getVectorNumElements();
2787 Index = Index % Width;
2788 }
2789
2790 // The element at index zero is already inside the vector.
2791 // - For a physical (HasRealUse==true) insert-element or extract-element
2792 // instruction that extracts integers, an explicit FPR -> GPR move is
2793 // needed. So it has non-zero cost.
2794 // - For the rest of cases (virtual instruction or element type is float),
2795 // consider the instruction free.
2796 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2797 return 0;
2798
2799 // This is recognising a LD1 single-element structure to one lane of one
2800 // register instruction. I.e., if this is an `insertelement` instruction,
2801 // and its second operand is a load, then we will generate a LD1, which
2802 // are expensive instructions.
2803 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2804 return ST->getVectorInsertExtractBaseCost() + 1;
2805
2806 // i1 inserts and extract will include an extra cset or cmp of the vector
2807 // value. Increase the cost by 1 to account.
2808 if (Val->getScalarSizeInBits() == 1)
2809 return ST->getVectorInsertExtractBaseCost() + 1;
2810
2811 // FIXME:
2812 // If the extract-element and insert-element instructions could be
2813 // simplified away (e.g., could be combined into users by looking at use-def
2814 // context), they have no cost. This is not done in the first place for
2815 // compile-time considerations.
2816 }
2817
2818 // All other insert/extracts cost this much.
2819 return ST->getVectorInsertExtractBaseCost();
2820}
2821
// Public entry point for costing a hypothetical insert/extract with no
// concrete instruction available. HasRealUse is only true for inserts whose
// inserted operand is a real (non-undef) value.
// NOTE(review): doxygen lines 2822-2823 (the signature opening) are missing
// from this scraped listing — confirm against upstream.
2824 unsigned Index, Value *Op0,
2825 Value *Op1) {
2826 bool HasRealUse =
2827 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
2828 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2829}
2830
// Overload used when the concrete instruction is available: it is always a
// real use, and the instruction is forwarded so insert-of-load (LD1)
// patterns can be recognised by the helper.
// NOTE(review): doxygen lines 2831 and 2833 (parts of the signature) are
// missing from this scraped listing — confirm against upstream.
2832 Type *Val,
2834 unsigned Index) {
2835 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2836}
2837
// Scalarization overhead for the demanded lanes of a vector: floating-point
// element types defer to the base implementation, while integer lanes are
// charged per demanded insert/extract.
// NOTE(review): several doxygen lines are missing from this scraped listing
// (2838/2840: signature remainder; 2842: the early return taken for
// scalable vectors; 2847: the multiplier that ends the final return) —
// confirm against upstream; the remaining code is kept verbatim.
2839 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
2841 if (isa<ScalableVectorType>(Ty))
2843 if (Ty->getElementType()->isFloatingPointTy())
2844 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
2845 CostKind);
2846 return DemandedElts.popcount() * (Insert + Extract) *
2848}
2849
// Arithmetic cost model for AArch64: legalizes Ty, then special-cases
// divisions by uniform constants (expanded instruction sequences, SVE
// lowering, or full scalarization), v2i64 multiplies, cheap logical/shift
// ops, fp16/bf16 arithmetic without native support, and scalar FREM (which
// becomes an fmod/fmodf libcall).
// NOTE(review): this scraped listing drops several doxygen lines (2850 and
// 2852-2853: the signature opening; 2857: the CostKind guard; 2875, 2880,
// 2895, 2897, 2899, 2905 and 2942: `InstructionCost ... =
// getArithmeticInstrCost(`-style statement openings) — confirm against
// upstream before editing; the remaining code is kept verbatim.
2851 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2854 const Instruction *CxtI) {
2855
2856 // TODO: Handle more cost kinds.
2858 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2859 Op2Info, Args, CxtI);
2860
2861 // Legalize the type.
2862 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2863 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2864
2865 switch (ISD) {
2866 default:
2867 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2868 Op2Info);
2869 case ISD::SDIV:
2870 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2871 // On AArch64, scalar signed division by constants power-of-two are
2872 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2873 // The OperandValue properties many not be same as that of previous
2874 // operation; conservatively assume OP_None.
2876 Instruction::Add, Ty, CostKind,
2877 Op1Info.getNoProps(), Op2Info.getNoProps());
2878 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2879 Op1Info.getNoProps(), Op2Info.getNoProps());
2881 Instruction::Select, Ty, CostKind,
2882 Op1Info.getNoProps(), Op2Info.getNoProps());
2883 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2884 Op1Info.getNoProps(), Op2Info.getNoProps());
2885 return Cost;
2886 }
2887 [[fallthrough]];
2888 case ISD::UDIV: {
2889 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2890 auto VT = TLI->getValueType(DL, Ty);
2891 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2892 // Vector signed division by constant are expanded to the
2893 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2894 // to MULHS + SUB + SRL + ADD + SRL.
2896 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2898 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2900 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2901 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2902 }
2903 }
2904
2906 Opcode, Ty, CostKind, Op1Info, Op2Info);
2907 if (Ty->isVectorTy()) {
2908 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2909 // SDIV/UDIV operations are lowered using SVE, then we can have less
2910 // costs.
2911 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2912 ->getPrimitiveSizeInBits()
2913 .getFixedValue() < 128) {
2914 EVT VT = TLI->getValueType(DL, Ty);
2915 static const CostTblEntry DivTbl[]{
2916 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2917 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2918 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2919 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2920 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2921 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2922
2923 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2924 if (nullptr != Entry)
2925 return Entry->Cost;
2926 }
2927 // For 8/16-bit elements, the cost is higher because the type
2928 // requires promotion and possibly splitting:
2929 if (LT.second.getScalarType() == MVT::i8)
2930 Cost *= 8;
2931 else if (LT.second.getScalarType() == MVT::i16)
2932 Cost *= 4;
2933 return Cost;
2934 } else {
2935 // If one of the operands is a uniform constant then the cost for each
2936 // element is Cost for insertion, extraction and division.
2937 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
2938 // operation with scalar type
2939 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2940 (Op2Info.isConstant() && Op2Info.isUniform())) {
2941 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2943 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2944 return (4 + DivCost) * VTy->getNumElements();
2945 }
2946 }
2947 // On AArch64, without SVE, vector divisions are expanded
2948 // into scalar divisions of each pair of elements.
2949 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2950 CostKind, Op1Info, Op2Info);
2951 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2952 Op1Info, Op2Info);
2953 }
2954
2955 // TODO: if one of the arguments is scalar, then it's not necessary to
2956 // double the cost of handling the vector elements.
2957 Cost += Cost;
2958 }
2959 return Cost;
2960 }
2961 case ISD::MUL:
2962 // When SVE is available, then we can lower the v2i64 operation using
2963 // the SVE mul instruction, which has a lower cost.
2964 if (LT.second == MVT::v2i64 && ST->hasSVE())
2965 return LT.first;
2966
2967 // When SVE is not available, there is no MUL.2d instruction,
2968 // which means mul <2 x i64> is expensive as elements are extracted
2969 // from the vectors and the muls scalarized.
2970 // As getScalarizationOverhead is a bit too pessimistic, we
2971 // estimate the cost for a i64 vector directly here, which is:
2972 // - four 2-cost i64 extracts,
2973 // - two 2-cost i64 inserts, and
2974 // - two 1-cost muls.
2975 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
2976 // LT.first = 2 the cost is 28. If both operands are extensions it will not
2977 // need to scalarize so the cost can be cheaper (smull or umull).
2978 // so the cost can be cheaper (smull or umull).
2979 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2980 return LT.first;
2981 return LT.first * 14;
2982 case ISD::ADD:
2983 case ISD::XOR:
2984 case ISD::OR:
2985 case ISD::AND:
2986 case ISD::SRL:
2987 case ISD::SRA:
2988 case ISD::SHL:
2989 // These nodes are marked as 'custom' for combining purposes only.
2990 // We know that they are legal. See LowerAdd in ISelLowering.
2991 return LT.first;
2992
2993 case ISD::FNEG:
2994 case ISD::FADD:
2995 case ISD::FSUB:
2996 // Increase the cost for half and bfloat types if not architecturally
2997 // supported.
2998 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
2999 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3000 return 2 * LT.first;
3001 if (!Ty->getScalarType()->isFP128Ty())
3002 return LT.first;
3003 [[fallthrough]];
3004 case ISD::FMUL:
3005 case ISD::FDIV:
3006 // These nodes are marked as 'custom' just to lower them to SVE.
3007 // We know said lowering will incur no additional cost.
3008 if (!Ty->getScalarType()->isFP128Ty())
3009 return 2 * LT.first;
3010
3011 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3012 Op2Info);
3013 case ISD::FREM:
3014 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3015 // those functions are not declared in the module.
3016 if (!Ty->isVectorTy())
3017 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3018 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3019 Op2Info);
3020 }
3021}
3022
// Address computation cost: non-consecutive (non-constant-strided) vector
// addressing cannot be folded into the addressing mode, so it is charged the
// NEON non-constant-stride overhead; everything else costs 1.
// NOTE(review): doxygen line 3023 (the signature opening) is missing from
// this scraped listing — confirm against upstream.
3024 ScalarEvolution *SE,
3025 const SCEV *Ptr) {
3026 // Address computations in vectorized code with non-consecutive addresses will
3027 // likely result in more instructions compared to scalar code where the
3028 // computation can more often be merged into the index mode. The resulting
3029 // extra micro-ops can significantly decrease throughput.
3030 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3031 int MaxMergeDistance = 64;
3032
3033 if (Ty->isVectorTy() && SE &&
3034 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3035 return NumVectorInstToHideOverhead;
3036
3037 // In many cases the address computation is not merged into the instruction
3038 // addressing mode.
3039 return 1;
3040}
3041
// Compare/select cost model: recognises compare/select chains lowerable to a
// (F)CMxx + BFI pair, wide fixed-vector selects that scalarize badly, v4f16
// compares without full FP16, and icmp-eq-with-zero of an AND (free via the
// flag-setting ANDS form).
// NOTE(review): doxygen lines 3042 (the signature opening), 3045 (the
// CostKind parameter) and 3048 (the CostKind guard before the early return)
// are missing from this scraped listing — confirm against upstream; the
// remaining code is kept verbatim.
3043 Type *CondTy,
3044 CmpInst::Predicate VecPred,
3046 const Instruction *I) {
3047 // TODO: Handle other cost kinds.
3049 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3050 I);
3051
3052 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3053 // We don't lower some vector selects well that are wider than the register
3054 // width.
3055 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3056 // We would need this many instructions to hide the scalarization happening.
3057 const int AmortizationCost = 20;
3058
3059 // If VecPred is not set, check if we can get a predicate from the context
3060 // instruction, if its type matches the requested ValTy.
3061 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3062 CmpInst::Predicate CurrentPred;
3063 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3064 m_Value())))
3065 VecPred = CurrentPred;
3066 }
3067 // Check if we have a compare/select chain that can be lowered using
3068 // a (F)CMxx & BFI pair.
3069 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3070 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3071 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3072 VecPred == CmpInst::FCMP_UNE) {
3073 static const auto ValidMinMaxTys = {
3074 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3075 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3076 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3077
3078 auto LT = getTypeLegalizationCost(ValTy);
3079 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3080 (ST->hasFullFP16() &&
3081 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3082 return LT.first;
3083 }
3084
3085 static const TypeConversionCostTblEntry
3086 VectorSelectTbl[] = {
3087 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3088 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3089 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3090 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3091 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3092 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3093 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3094 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3095 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3096 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3097 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3098 };
3099
3100 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3101 EVT SelValTy = TLI->getValueType(DL, ValTy);
3102 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3103 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3104 SelCondTy.getSimpleVT(),
3105 SelValTy.getSimpleVT()))
3106 return Entry->Cost;
3107 }
3108 }
3109
3110 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3111 auto LT = getTypeLegalizationCost(ValTy);
3112 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3113 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3114 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3115 }
3116
3117 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3118 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3119 // be profitable.
3120 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3121 ICmpInst::isEquality(VecPred) &&
3122 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3123 match(I->getOperand(1), m_Zero()) &&
3124 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3125 return 0;
3126
3127 // The base case handles scalable vectors fine for now, since it treats the
3128 // cost as 1 * legalization cost.
3129 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3130}
3131
// Memcmp expansion options: disabled under strict alignment (misaligned
// loads expand badly there); otherwise allows overlapping loads with
// 8/4/2/1-byte load sizes and 3/5/6-byte tail expansions.
// NOTE(review): doxygen lines 3132 (the return-type line) and 3134 (the
// `MemCmpExpansionOptions Options;` declaration the code below relies on)
// are missing from this scraped listing — confirm against upstream.
3133AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3135 if (ST->requiresStrictAlign()) {
3136 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3137 // a bunch of instructions when strict align is enabled.
3138 return Options;
3139 }
3140 Options.AllowOverlappingLoads = true;
3141 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3142 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3143 // TODO: Though vector loads usually perform well on AArch64, in some targets
3144 // they may wake up the FP unit, which raises the power consumption. Perhaps
3145 // they could be used with no holds barred (-O3).
3146 Options.LoadSizes = {8, 4, 2, 1};
3147 Options.AllowedTailExpansions = {3, 5, 6};
3148 return Options;
3149}
3150
// NOTE(review): doxygen line 3151 (the signature) is missing from this
// scraped listing, so the function name is not visible here; by position in
// this file it is presumably a simple boolean TTI query keyed off SVE
// availability — confirm the exact name against upstream.
3152 return ST->hasSVE();
3153}
3154
// Masked load/store cost: NEON fixed vectors defer to the base
// implementation; otherwise legal SVE types cost the legalization factor.
// NOTE(review): doxygen lines 3155-3156 and 3158 (the signature) as well as
// 3164 and 3171 (the invalid-cost returns taken after the two guards below)
// are missing from this scraped listing — confirm against upstream; the
// remaining code is kept verbatim.
3157 Align Alignment, unsigned AddressSpace,
3159 if (useNeonVector(Src))
3160 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3161 CostKind);
3162 auto LT = getTypeLegalizationCost(Src);
3163 if (!LT.first.isValid())
3165
3166 // The code-generator is currently not able to handle scalable vectors
3167 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3168 // it. This change will be removed when code-generation for these types is
3169 // sufficiently reliable.
3170 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
3172
3173 return LT.first;
3174}
3175
3176static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3177 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3178}
3179
// Gather/scatter cost: illegal-for-SVE cases defer to the base
// implementation; otherwise the cost is per-element memory-op cost times the
// SVE gather/scatter overhead, scaled by legalization and element count.
// NOTE(review): doxygen lines 3180 (the signature opening), 3189 and 3193
// (the invalid-cost returns after the first two guards) and 3200-3201 (the
// tail of the <vscale x 1> guard and its return) are missing from this
// scraped listing — confirm against upstream; the remaining code is kept
// verbatim.
3181 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3182 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3183 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3184 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3185 Alignment, CostKind, I);
3186 auto *VT = cast<VectorType>(DataTy);
3187 auto LT = getTypeLegalizationCost(DataTy);
3188 if (!LT.first.isValid())
3190
3191 if (!LT.second.isVector() ||
3192 !isElementTypeLegalForScalableVector(VT->getElementType()))
3194
3195 // The code-generator is currently not able to handle scalable vectors
3196 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3197 // it. This change will be removed when code-generation for these types is
3198 // sufficiently reliable.
3199 if (cast<VectorType>(DataTy)->getElementCount() ==
3202
3203 ElementCount LegalVF = LT.second.getVectorElementCount();
3204 InstructionCost MemOpCost =
3205 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3206 {TTI::OK_AnyValue, TTI::OP_None}, I);
3207 // Add on an overhead cost for using gathers/scatters.
3208 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
3209 // point we may want a per-CPU overhead.
3210 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3211 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3212}
3213
// NOTE(review): doxygen line 3214 (the signature, by position
// `useNeonVector`) is missing from this scraped listing — confirm against
// upstream. True for fixed vectors when the subtarget does not use SVE for
// fixed-length vectors, i.e. the type will be lowered with NEON.
3215 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3216}
3217
// Load/store cost model: handles structs, invalid/unsupported scalable
// types, slow misaligned 128-bit stores, pointer vectors (STP/LDP),
// truncating/extending NEON accesses, and non-power-of-2 NEON vectors that
// split into smaller power-of-2 ld1/st1 operations.
// NOTE(review): this scraped listing drops several doxygen lines (3217-3218:
// the signature opening; 3221: a parameter line; 3232 and 3240: the
// invalid-cost returns; 3243 and 3246: the CostKind guards before the two
// early returns; 3289: the cost accumulator declaration used by the
// worklist loop) — confirm against upstream; the remaining code is kept
// verbatim.
3219 MaybeAlign Alignment,
3220 unsigned AddressSpace,
3222 TTI::OperandValueInfo OpInfo,
3223 const Instruction *I) {
3224 EVT VT = TLI->getValueType(DL, Ty, true);
3225 // Type legalization can't handle structs
3226 if (VT == MVT::Other)
3227 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3228 CostKind);
3229
3230 auto LT = getTypeLegalizationCost(Ty);
3231 if (!LT.first.isValid())
3233
3234 // The code-generator is currently not able to handle scalable vectors
3235 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3236 // it. This change will be removed when code-generation for these types is
3237 // sufficiently reliable.
3238 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3239 if (VTy->getElementCount() == ElementCount::getScalable(1))
3241
3242 // TODO: consider latency as well for TCK_SizeAndLatency.
3244 return LT.first;
3245
3247 return 1;
3248
3249 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3250 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3251 // Unaligned stores are extremely inefficient. We don't split all
3252 // unaligned 128-bit stores because the negative impact that has shown in
3253 // practice on inlined block copy code.
3254 // We make such stores expensive so that we will only vectorize if there
3255 // are 6 other instructions getting vectorized.
3256 const int AmortizationCost = 6;
3257
3258 return LT.first * 2 * AmortizationCost;
3259 }
3260
3261 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3262 if (Ty->isPtrOrPtrVectorTy())
3263 return LT.first;
3264
3265 if (useNeonVector(Ty)) {
3266 // Check truncating stores and extending loads.
3267 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3268 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
3269 if (VT == MVT::v4i8)
3270 return 2;
3271 // Otherwise we need to scalarize.
3272 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3273 }
3274 EVT EltVT = VT.getVectorElementType();
3275 unsigned EltSize = EltVT.getScalarSizeInBits();
3276 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3277 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3278 *Alignment != Align(1))
3279 return LT.first;
3280 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3281 // widening to v4i8, which produces suboptimal results.
3282 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3283 return LT.first;
3284
3285 // Check non-power-of-2 loads/stores for legal vector element types with
3286 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3287 // operations on smaller power-of-2 ops, including ld1/st1.
3288 LLVMContext &C = Ty->getContext();
3290 SmallVector<EVT> TypeWorklist;
3291 TypeWorklist.push_back(VT);
3292 while (!TypeWorklist.empty()) {
3293 EVT CurrVT = TypeWorklist.pop_back_val();
3294 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3295 if (isPowerOf2_32(CurrNumElements)) {
3296 Cost += 1;
3297 continue;
3298 }
3299
3300 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3301 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3302 TypeWorklist.push_back(
3303 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3304 }
3305 return Cost;
3306 }
3307
3308 return LT.first;
3309}
3310
// Interleaved access cost: legal interleave groups map to Factor ldN/stN
// instructions per legalized sub-vector; everything else defers to the base
// implementation.
// NOTE(review): doxygen lines 3311 (the signature opening) and 3319/3324
// (the returns taken after the two guards below) are missing from this
// scraped listing — confirm against upstream; the remaining code is kept
// verbatim.
3312 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3313 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3314 bool UseMaskForCond, bool UseMaskForGaps) {
3315 assert(Factor >= 2 && "Invalid interleave factor");
3316 auto *VecVTy = cast<VectorType>(VecTy);
3317
3318 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3320
3321 // Vectorization for masked interleaved accesses is only enabled for scalable
3322 // VF.
3323 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3325
3326 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3327 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3328 auto *SubVecTy =
3329 VectorType::get(VecVTy->getElementType(),
3330 VecVTy->getElementCount().divideCoefficientBy(Factor));
3331
3332 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3333 // Accesses having vector types that are a multiple of 128 bits can be
3334 // matched to more than one ldN/stN instruction.
3335 bool UseScalable;
3336 if (MinElts % Factor == 0 &&
3337 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3338 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3339 }
3340
3341 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3342 Alignment, AddressSpace, CostKind,
3343 UseMaskForCond, UseMaskForGaps);
3344}
3345
// Cost of keeping 128-bit vector values live across a call: each such type
// is charged one store plus one load (spill and refill around the call).
// NOTE(review): doxygen lines 3346-3349 (the signature and the cost
// accumulator declaration used below) are missing from this scraped listing
// — confirm against upstream; the remaining code is kept verbatim.
3350 for (auto *I : Tys) {
3351 if (!I->isVectorTy())
3352 continue;
3353 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3354 128)
3355 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3356 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3357 }
3358 return Cost;
3359}
3360
// NOTE(review): doxygen line 3361 (the signature, by position
// `getMaxInterleaveFactor`) is missing from this scraped listing — confirm
// against upstream. Simply forwards the subtarget's maximum interleave
// factor.
3362 return ST->getMaxInterleaveFactor();
3363}
3364
3365// For Falkor, we want to avoid having too many strided loads in a loop since
3366// that can exhaust the HW prefetcher resources. We adjust the unroller
3367// MaxCount preference below to attempt to ensure unrolling doesn't create too
3368// many strided loads.
// Falkor-specific unroll tuning: counts affine-strided loads in the loop and
// caps UP.MaxCount so that unrolling does not exceed the HW prefetcher's
// strided-load capacity (MaxStridedLoads).
// NOTE(review): doxygen lines 3370-3371 (the function name, parameter list
// and opening brace following `static void`) are missing from this scraped
// listing — confirm against upstream; the remaining code is kept verbatim.
3369static void
3372 enum { MaxStridedLoads = 7 };
3373 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3374 int StridedLoads = 0;
3375 // FIXME? We could make this more precise by looking at the CFG and
3376 // e.g. not counting loads in each side of an if-then-else diamond.
3377 for (const auto BB : L->blocks()) {
3378 for (auto &I : *BB) {
3379 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
3380 if (!LMemI)
3381 continue;
3382
3383 Value *PtrValue = LMemI->getPointerOperand();
3384 if (L->isLoopInvariant(PtrValue))
3385 continue;
3386
3387 const SCEV *LSCEV = SE.getSCEV(PtrValue);
3388 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
3389 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3390 continue;
3391
3392 // FIXME? We could take pairing of unrolled load copies into account
3393 // by looking at the AddRec, but we would probably have to limit this
3394 // to loops with no stores or other memory optimization barriers.
3395 ++StridedLoads;
3396 // We've seen enough strided loads that seeing more won't make a
3397 // difference.
3398 if (StridedLoads > MaxStridedLoads / 2)
3399 return StridedLoads;
3400 }
3401 }
3402 return StridedLoads;
3403 };
3404
3405 int StridedLoads = countStridedLoads(L, SE);
3406 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3407 << " strided loads\n");
3408 // Pick the largest power of 2 unroll count that won't result in too many
3409 // strided loads.
3410 if (StridedLoads) {
3411 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
3412 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3413 << UP.MaxCount << '\n');
3414 }
3415}
3416
3420 // Enable partial unrolling and runtime unrolling.
3421 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3422
3423 UP.UpperBound = true;
3424
3425 // For inner loop, it is more likely to be a hot one, and the runtime check
3426 // can be promoted out from LICM pass, so the overhead is less, let's try
3427 // a larger threshold to unroll more loops.
3428 if (L->getLoopDepth() > 1)
3429 UP.PartialThreshold *= 2;
3430
3431 // Disable partial & runtime unrolling on -Os.
3433
3437
3438 // Scan the loop: don't unroll loops with calls as this could prevent
3439 // inlining. Don't unroll vector loops either, as they don't benefit much from
3440 // unrolling.
3441 for (auto *BB : L->getBlocks()) {
3442 for (auto &I : *BB) {
3443 // Don't unroll vectorised loop.
3444 if (I.getType()->isVectorTy())
3445 return;
3446
3447 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3448 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3449 if (!isLoweredToCall(F))
3450 continue;
3451 }
3452 return;
3453 }
3454 }
3455 }
3456
3457 // Enable runtime unrolling for in-order models
3458 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
3459 // checking for that case, we can ensure that the default behaviour is
3460 // unchanged
3462 !ST->getSchedModel().isOutOfOrder()) {
3463 UP.Runtime = true;
3464 UP.Partial = true;
3465 UP.UnrollRemainder = true;
3467
3468 UP.UnrollAndJam = true;
3470 }
3471}
3472
3476}
3477
3479 Type *ExpectedType) {
3480 switch (Inst->getIntrinsicID()) {
3481 default:
3482 return nullptr;
3483 case Intrinsic::aarch64_neon_st2:
3484 case Intrinsic::aarch64_neon_st3:
3485 case Intrinsic::aarch64_neon_st4: {
3486 // Create a struct type
3487 StructType *ST = dyn_cast<StructType>(ExpectedType);
3488 if (!ST)
3489 return nullptr;
3490 unsigned NumElts = Inst->arg_size() - 1;
3491 if (ST->getNumElements() != NumElts)
3492 return nullptr;
3493 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3494 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3495 return nullptr;
3496 }
3497 Value *Res = PoisonValue::get(ExpectedType);
3498 IRBuilder<> Builder(Inst);
3499 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3500 Value *L = Inst->getArgOperand(i);
3501 Res = Builder.CreateInsertValue(Res, L, i);
3502 }
3503 return Res;
3504 }
3505 case Intrinsic::aarch64_neon_ld2:
3506 case Intrinsic::aarch64_neon_ld3:
3507 case Intrinsic::aarch64_neon_ld4:
3508 if (Inst->getType() == ExpectedType)
3509 return Inst;
3510 return nullptr;
3511 }
3512}
3513
3515 MemIntrinsicInfo &Info) {
3516 switch (Inst->getIntrinsicID()) {
3517 default:
3518 break;
3519 case Intrinsic::aarch64_neon_ld2:
3520 case Intrinsic::aarch64_neon_ld3:
3521 case Intrinsic::aarch64_neon_ld4:
3522 Info.ReadMem = true;
3523 Info.WriteMem = false;
3524 Info.PtrVal = Inst->getArgOperand(0);
3525 break;
3526 case Intrinsic::aarch64_neon_st2:
3527 case Intrinsic::aarch64_neon_st3:
3528 case Intrinsic::aarch64_neon_st4:
3529 Info.ReadMem = false;
3530 Info.WriteMem = true;
3531 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3532 break;
3533 }
3534
3535 switch (Inst->getIntrinsicID()) {
3536 default:
3537 return false;
3538 case Intrinsic::aarch64_neon_ld2:
3539 case Intrinsic::aarch64_neon_st2:
3540 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3541 break;
3542 case Intrinsic::aarch64_neon_ld3:
3543 case Intrinsic::aarch64_neon_st3:
3544 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3545 break;
3546 case Intrinsic::aarch64_neon_ld4:
3547 case Intrinsic::aarch64_neon_st4:
3548 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3549 break;
3550 }
3551 return true;
3552}
3553
3554/// See if \p I should be considered for address type promotion. We check if \p
3555/// I is a sext with right type and used in memory accesses. If it used in a
3556/// "complex" getelementptr, we allow it to be promoted without finding other
3557/// sext instructions that sign extended the same initial value. A getelementptr
3558/// is considered as "complex" if it has more than 2 operands.
3560 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3561 bool Considerable = false;
3562 AllowPromotionWithoutCommonHeader = false;
3563 if (!isa<SExtInst>(&I))
3564 return false;
3565 Type *ConsideredSExtType =
3566 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3567 if (I.getType() != ConsideredSExtType)
3568 return false;
3569 // See if the sext is the one with the right type and used in at least one
3570 // GetElementPtrInst.
3571 for (const User *U : I.users()) {
3572 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3573 Considerable = true;
3574 // A getelementptr is considered as "complex" if it has more than 2
3575 // operands. We will promote a SExt used in such complex GEP as we
3576 // expect some computation to be merged if they are done on 64 bits.
3577 if (GEPInst->getNumOperands() > 2) {
3578 AllowPromotionWithoutCommonHeader = true;
3579 break;
3580 }
3581 }
3582 }
3583 return Considerable;
3584}
3585
3587 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3588 if (!VF.isScalable())
3589 return true;
3590
3591 Type *Ty = RdxDesc.getRecurrenceType();
3593 return false;
3594
3595 switch (RdxDesc.getRecurrenceKind()) {
3596 case RecurKind::Add:
3597 case RecurKind::FAdd:
3598 case RecurKind::And:
3599 case RecurKind::Or:
3600 case RecurKind::Xor:
3601 case RecurKind::SMin:
3602 case RecurKind::SMax:
3603 case RecurKind::UMin:
3604 case RecurKind::UMax:
3605 case RecurKind::FMin:
3606 case RecurKind::FMax:
3607 case RecurKind::FMulAdd:
3608 case RecurKind::IAnyOf:
3609 case RecurKind::FAnyOf:
3610 return true;
3611 default:
3612 return false;
3613 }
3614}
3615
3618 FastMathFlags FMF,
3620 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3621
3622 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3623 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3624
3625 InstructionCost LegalizationCost = 0;
3626 if (LT.first > 1) {
3627 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3628 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3629 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3630 }
3631
3632 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3633}
3634
3636 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3637 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3638 InstructionCost LegalizationCost = 0;
3639 if (LT.first > 1) {
3640 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3641 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3642 LegalizationCost *= LT.first - 1;
3643 }
3644
3645 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3646 assert(ISD && "Invalid opcode");
3647 // Add the final reduction cost for the legal horizontal reduction
3648 switch (ISD) {
3649 case ISD::ADD:
3650 case ISD::AND:
3651 case ISD::OR:
3652 case ISD::XOR:
3653 case ISD::FADD:
3654 return LegalizationCost + 2;
3655 default:
3657 }
3658}
3659
3662 std::optional<FastMathFlags> FMF,
3665 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3666 InstructionCost BaseCost =
3667 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3668 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3669 // end up vectorizing for more computationally intensive loops.
3670 return BaseCost + FixedVTy->getNumElements();
3671 }
3672
3673 if (Opcode != Instruction::FAdd)
3675
3676 auto *VTy = cast<ScalableVectorType>(ValTy);
3678 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3679 Cost *= getMaxNumElements(VTy->getElementCount());
3680 return Cost;
3681 }
3682
3683 if (isa<ScalableVectorType>(ValTy))
3684 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3685
3686 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3687 MVT MTy = LT.second;
3688 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3689 assert(ISD && "Invalid opcode");
3690
3691 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3692 // instructions as twice a normal vector add, plus 1 for each legalization
3693 // step (LT.first). This is the only arithmetic vector reduction operation for
3694 // which we have an instruction.
3695 // OR, XOR and AND costs should match the codegen from:
3696 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3697 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3698 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3699 static const CostTblEntry CostTblNoPairwise[]{
3700 {ISD::ADD, MVT::v8i8, 2},
3701 {ISD::ADD, MVT::v16i8, 2},
3702 {ISD::ADD, MVT::v4i16, 2},
3703 {ISD::ADD, MVT::v8i16, 2},
3704 {ISD::ADD, MVT::v4i32, 2},
3705 {ISD::ADD, MVT::v2i64, 2},
3706 {ISD::OR, MVT::v8i8, 15},
3707 {ISD::OR, MVT::v16i8, 17},
3708 {ISD::OR, MVT::v4i16, 7},
3709 {ISD::OR, MVT::v8i16, 9},
3710 {ISD::OR, MVT::v2i32, 3},
3711 {ISD::OR, MVT::v4i32, 5},
3712 {ISD::OR, MVT::v2i64, 3},
3713 {ISD::XOR, MVT::v8i8, 15},
3714 {ISD::XOR, MVT::v16i8, 17},
3715 {ISD::XOR, MVT::v4i16, 7},
3716 {ISD::XOR, MVT::v8i16, 9},
3717 {ISD::XOR, MVT::v2i32, 3},
3718 {ISD::XOR, MVT::v4i32, 5},
3719 {ISD::XOR, MVT::v2i64, 3},
3720 {ISD::AND, MVT::v8i8, 15},
3721 {ISD::AND, MVT::v16i8, 17},
3722 {ISD::AND, MVT::v4i16, 7},
3723 {ISD::AND, MVT::v8i16, 9},
3724 {ISD::AND, MVT::v2i32, 3},
3725 {ISD::AND, MVT::v4i32, 5},
3726 {ISD::AND, MVT::v2i64, 3},
3727 };
3728 switch (ISD) {
3729 default:
3730 break;
3731 case ISD::ADD:
3732 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3733 return (LT.first - 1) + Entry->Cost;
3734 break;
3735 case ISD::XOR:
3736 case ISD::AND:
3737 case ISD::OR:
3738 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3739 if (!Entry)
3740 break;
3741 auto *ValVTy = cast<FixedVectorType>(ValTy);
3742 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3743 isPowerOf2_32(ValVTy->getNumElements())) {
3744 InstructionCost ExtraCost = 0;
3745 if (LT.first != 1) {
3746 // Type needs to be split, so there is an extra cost of LT.first - 1
3747 // arithmetic ops.
3748 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3749 MTy.getVectorNumElements());
3750 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3751 ExtraCost *= LT.first - 1;
3752 }
3753 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
3754 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3755 return Cost + ExtraCost;
3756 }
3757 break;
3758 }
3759 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3760}
3761
3763 static const CostTblEntry ShuffleTbl[] = {
3764 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3765 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3766 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3767 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3768 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3769 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3770 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3771 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3772 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3773 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3774 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3775 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3776 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3777 };
3778
3779 // The code-generator is currently not able to handle scalable vectors
3780 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3781 // it. This change will be removed when code-generation for these types is
3782 // sufficiently reliable.
3785
3786 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3787 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3789 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3790 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3791 : LT.second;
3792 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3793 InstructionCost LegalizationCost = 0;
3794 if (Index < 0) {
3795 LegalizationCost =
3796 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3798 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3800 }
3801
3802 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
3803 // Cost performed on a promoted type.
3804 if (LT.second.getScalarType() == MVT::i1) {
3805 LegalizationCost +=
3806 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3808 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3810 }
3811 const auto *Entry =
3812 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3813 assert(Entry && "Illegal Type for Splice");
3814 LegalizationCost += Entry->Cost;
3815 return LegalizationCost * LT.first;
3816}
3817
3821 ArrayRef<const Value *> Args, const Instruction *CxtI) {
3822 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3823
3824 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3825 // into smaller vectors and sum the cost of each shuffle.
3826 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3827 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3828 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
3829
3830 // Check for ST3/ST4 instructions, which are represented in llvm IR as
3831 // store(interleaving-shuffle). The shuffle cost could potentially be free,
3832 // but we model it with a cost of LT.first so that LD3/LD3 have a higher
3833 // cost than just the store.
3834 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
3836 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
3838 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
3839 return LT.first;
3840
3841 unsigned TpNumElts = Mask.size();
3842 unsigned LTNumElts = LT.second.getVectorNumElements();
3843 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3844 VectorType *NTp =
3845 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3847 for (unsigned N = 0; N < NumVecs; N++) {
3848 SmallVector<int> NMask;
3849 // Split the existing mask into chunks of size LTNumElts. Track the source
3850 // sub-vectors to ensure the result has at most 2 inputs.
3851 unsigned Source1, Source2;
3852 unsigned NumSources = 0;
3853 for (unsigned E = 0; E < LTNumElts; E++) {
3854 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3856 if (MaskElt < 0) {
3858 continue;
3859 }
3860
3861 // Calculate which source from the input this comes from and whether it
3862 // is new to us.
3863 unsigned Source = MaskElt / LTNumElts;
3864 if (NumSources == 0) {
3865 Source1 = Source;
3866 NumSources = 1;
3867 } else if (NumSources == 1 && Source != Source1) {
3868 Source2 = Source;
3869 NumSources = 2;
3870 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3871 NumSources++;
3872 }
3873
3874 // Add to the new mask. For the NumSources>2 case these are not correct,
3875 // but are only used for the modular lane number.
3876 if (Source == Source1)
3877 NMask.push_back(MaskElt % LTNumElts);
3878 else if (Source == Source2)
3879 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3880 else
3881 NMask.push_back(MaskElt % LTNumElts);
3882 }
3883 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3884 // getShuffleCost. If not then cost it using the worst case.
3885 if (NumSources <= 2)
3886 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3888 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
3889 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3890 return ME.value() % LTNumElts == ME.index();
3891 }))
3892 Cost += LTNumElts - 1;
3893 else
3894 Cost += LTNumElts;
3895 }
3896 return Cost;
3897 }
3898
3899 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
3900 // Treat extractsubvector as single op permutation.
3901 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
3902 if (IsExtractSubvector && LT.second.isFixedLengthVector())
3904
3905 // Check for broadcast loads, which are supported by the LD1R instruction.
3906 // In terms of code-size, the shuffle vector is free when a load + dup get
3907 // folded into a LD1R. That's what we check and return here. For performance
3908 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3909 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3910 // that we model the load + dup sequence slightly higher because LD1R is a
3911 // high latency instruction.
3912 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3913 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3914 if (IsLoad && LT.second.isVector() &&
3916 LT.second.getVectorElementCount()))
3917 return 0;
3918 }
3919
3920 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3921 // from the perfect shuffle tables.
3922 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3923 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3924 all_of(Mask, [](int E) { return E < 8; }))
3925 return getPerfectShuffleCost(Mask);
3926
3927 // Check for identity masks, which we can treat as free.
3928 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
3929 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
3930 all_of(enumerate(Mask), [](const auto &M) {
3931 return M.value() < 0 || M.value() == (int)M.index();
3932 }))
3933 return 0;
3934
3935 // Check for other shuffles that are not SK_ kinds but we have native
3936 // instructions for, for example ZIP and UZP.
3937 unsigned Unused;
3938 if (LT.second.isFixedLengthVector() &&
3939 LT.second.getVectorNumElements() == Mask.size() &&
3940 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
3941 (isZIPMask(Mask, LT.second, Unused) ||
3942 isUZPMask(Mask, LT.second, Unused)))
3943 return 1;
3944
3945 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3946 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3947 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3948 static const CostTblEntry ShuffleTbl[] = {
3949 // Broadcast shuffle kinds can be performed with 'dup'.
3950 {TTI::SK_Broadcast, MVT::v8i8, 1},
3951 {TTI::SK_Broadcast, MVT::v16i8, 1},
3952 {TTI::SK_Broadcast, MVT::v4i16, 1},
3953 {TTI::SK_Broadcast, MVT::v8i16, 1},
3954 {TTI::SK_Broadcast, MVT::v2i32, 1},
3955 {TTI::SK_Broadcast, MVT::v4i32, 1},
3956 {TTI::SK_Broadcast, MVT::v2i64, 1},
3957 {TTI::SK_Broadcast, MVT::v4f16, 1},
3958 {TTI::SK_Broadcast, MVT::v8f16, 1},
3959 {TTI::SK_Broadcast, MVT::v2f32, 1},
3960 {TTI::SK_Broadcast, MVT::v4f32, 1},
3961 {TTI::SK_Broadcast, MVT::v2f64, 1},
3962 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3963 // 'zip1/zip2' instructions.
3964 {TTI::SK_Transpose, MVT::v8i8, 1},
3965 {TTI::SK_Transpose, MVT::v16i8, 1},
3966 {TTI::SK_Transpose, MVT::v4i16, 1},
3967 {TTI::SK_Transpose, MVT::v8i16, 1},
3968 {TTI::SK_Transpose, MVT::v2i32, 1},
3969 {TTI::SK_Transpose, MVT::v4i32, 1},
3970 {TTI::SK_Transpose, MVT::v2i64, 1},
3971 {TTI::SK_Transpose, MVT::v4f16, 1},
3972 {TTI::SK_Transpose, MVT::v8f16, 1},
3973 {TTI::SK_Transpose, MVT::v2f32, 1},
3974 {TTI::SK_Transpose, MVT::v4f32, 1},
3975 {TTI::SK_Transpose, MVT::v2f64, 1},
3976 // Select shuffle kinds.
3977 // TODO: handle vXi8/vXi16.
3978 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3979 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3980 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3981 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3982 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3983 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3984 // PermuteSingleSrc shuffle kinds.
3985 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
3986 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3987 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
3988 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
3989 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3990 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
3991 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3992 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3993 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
3994 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
3995 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
3996 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3997 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
3998 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
3999 // Reverse can be lowered with `rev`.
4000 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4001 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4002 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4003 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4004 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4005 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4006 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4007 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4008 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4009 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4010 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4011 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4012 // Splice can all be lowered as `ext`.
4013 {TTI::SK_Splice, MVT::v2i32, 1},
4014 {TTI::SK_Splice, MVT::v4i32, 1},
4015 {TTI::SK_Splice, MVT::v2i64, 1},
4016 {TTI::SK_Splice, MVT::v2f32, 1},
4017 {TTI::SK_Splice, MVT::v4f32, 1},
4018 {TTI::SK_Splice, MVT::v2f64, 1},
4019 {TTI::SK_Splice, MVT::v8f16, 1},
4020 {TTI::SK_Splice, MVT::v8bf16, 1},
4021 {TTI::SK_Splice, MVT::v8i16, 1},
4022 {TTI::SK_Splice, MVT::v16i8, 1},
4023 {TTI::SK_Splice, MVT::v4bf16, 1},
4024 {TTI::SK_Splice, MVT::v4f16, 1},
4025 {TTI::SK_Splice, MVT::v4i16, 1},
4026 {TTI::SK_Splice, MVT::v8i8, 1},
4027 // Broadcast shuffle kinds for scalable vectors
4028 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4029 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4030 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4031 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4032 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4033 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4034 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4035 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4036 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4037 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4038 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4039 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4040 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4041 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4042 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4043 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4044 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4045 // Handle the cases for vector.reverse with scalable vectors
4046 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4047 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4048 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4049 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4050 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4051 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4052 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4053 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4054 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4055 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4056 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4057 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4058 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4059 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4060 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4061 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4062 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4063 };
4064 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4065 return LT.first * Entry->Cost;
4066 }
4067
4068 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4069 return getSpliceCost(Tp, Index);
4070
4071 // Inserting a subvector can often be done with either a D, S or H register
4072 // move, so long as the inserted vector is "aligned".
4073 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4074 LT.second.getSizeInBits() <= 128 && SubTp) {
4075 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4076 if (SubLT.second.isVector()) {
4077 int NumElts = LT.second.getVectorNumElements();
4078 int NumSubElts = SubLT.second.getVectorNumElements();
4079 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4080 return SubLT.first;
4081 }
4082 }
4083
4084 // Restore optimal kind.
4085 if (IsExtractSubvector)
4087 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
4088 CxtI);
4089}
4090
4093 const auto &Strides = DenseMap<Value *, const SCEV *>();
4094 for (BasicBlock *BB : TheLoop->blocks()) {
4095 // Scan the instructions in the block and look for addresses that are
4096 // consecutive and decreasing.
4097 for (Instruction &I : *BB) {
4098 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4100 Type *AccessTy = getLoadStoreType(&I);
4101 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4102 /*ShouldCheckWrap=*/false)
4103 .value_or(0) < 0)
4104 return true;
4105 }
4106 }
4107 }
4108 return false;
4109}
4110
4112 if (!ST->hasSVE())
4113 return false;
4114
4115 // We don't currently support vectorisation with interleaving for SVE - with
4116 // such loops we're better off not using tail-folding. This gives us a chance
4117 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4118 if (TFI->IAI->hasGroups())
4119 return false;
4120
4122 if (TFI->LVL->getReductionVars().size())
4123 Required |= TailFoldingOpts::Reductions;
4124 if (TFI->LVL->getFixedOrderRecurrences().size())
4125 Required |= TailFoldingOpts::Recurrences;
4126
4127 // We call this to discover whether any load/store pointers in the loop have
4128 // negative strides. This will require extra work to reverse the loop
4129 // predicate, which may be expensive.
4132 Required |= TailFoldingOpts::Reverse;
4133 if (Required == TailFoldingOpts::Disabled)
4134 Required |= TailFoldingOpts::Simple;
4135
4137 Required))
4138 return false;
4139
4140 // Don't tail-fold for tight loops where we would be better off interleaving
4141 // with an unpredicated loop.
4142 unsigned NumInsns = 0;
4143 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4144 NumInsns += BB->sizeWithoutDebug();
4145 }
4146
4147 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
4148 return NumInsns >= SVETailFoldInsnThreshold;
4149}
4150
4153 int64_t BaseOffset, bool HasBaseReg,
4154 int64_t Scale, unsigned AddrSpace) const {
4155 // Scaling factors are not free at all.
4156 // Operands | Rt Latency
4157 // -------------------------------------------
4158 // Rt, [Xn, Xm] | 4
4159 // -------------------------------------------
4160 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
4161 // Rt, [Xn, Wm, <extend> #imm] |
4163 AM.BaseGV = BaseGV;
4164 AM.BaseOffs = BaseOffset;
4165 AM.HasBaseReg = HasBaseReg;
4166 AM.Scale = Scale;
4167 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4168 // Scale represents reg2 * scale, thus account for 1 if
4169 // it is not equal to 0 or 1.
4170 return AM.Scale != 0 && AM.Scale != 1;
4171 return -1;
4172}
4173
4175 // For the binary operators (e.g. or) we need to be more careful than
4176 // selects, here we only transform them if they are already at a natural
4177 // break point in the code - the end of a block with an unconditional
4178 // terminator.
4179 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4180 isa<BranchInst>(I->getNextNode()) &&
4181 cast<BranchInst>(I->getNextNode())->isUnconditional())
4182 return true;
4184}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible instructions for the selected mode.
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file defines a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
amdgpu AMDGPU Register Bank Select
This file provides a helper that implements much of the TTI interface in terms of the target-independent code generator and TargetLowering interfaces.
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
IntegerType * Int32Ty
#define P(N)
const char LLVMTargetMachineRef TM
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getFastMathFlags(const MachineInstr &I)
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e.g. the function is known not to be in streaming mode, where NEON instructions are unavailable).
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
unsigned getMinSVEVectorSizeInBits() const
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation for a vector with ElementCount VF.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given type.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:76
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1620
unsigned countLeadingOnes() const
Definition: APInt.h:1574
void negate()
Negate this APInt in place.
Definition: APInt.h:1421
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:582
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:891
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:969
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:762
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:654
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:855
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:339
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name, BasicBlock::iterator InsertBefore)
Definition: InstrTypes.h:298
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1467
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1715
bool isStrictFP() const
Determine if the call requires strict floating point semantics.
Definition: InstrTypes.h:2201
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1660
unsigned arg_size() const
Definition: InstrTypes.h:1658
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1754
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:966
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:969
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:972
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:970
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:971
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:973
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:982
bool isIntPredicate() const
Definition: InstrTypes.h:1096
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1663
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:88
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2462
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2513
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1045
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2450
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:559
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition: IRBuilder.cpp:1214