#include "llvm/IR/IntrinsicsAArch64.h"

#define DEBUG_TYPE "aarch64tti"

    "Penalty of calling a function that requires a change to PSTATE.SM"));

    cl::desc(
        "Penalty of inlining a call that requires a change to PSTATE.SM"));

    cl::desc("The cost of a histcnt instruction"));

    cl::desc("The number of instructions to search for a redundant dmb"));
class TailFoldingOption {
  bool NeedsDefault = true;

  void setNeedsDefault(bool V) { NeedsDefault = V; }
    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits &= ~DisableBits;

    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              " (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
  void operator=(const std::string &Val) {
    setNeedsDefault(false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
      setInitialBits(TailFoldingOpts::Disabled);

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
187 "Control the use of vectorisation using tail-folding for SVE where the"
188 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
189 "\ndisabled (Initial) No loop types will vectorize using "
191 "\ndefault (Initial) Uses the default tail-folding settings for "
193 "\nall (Initial) All legal loop types will vectorize using "
195 "\nsimple (Initial) Use tail-folding for simple loops (not "
196 "reductions or recurrences)"
197 "\nreductions Use tail-folding for loops containing reductions"
198 "\nnoreductions Inverse of above"
199 "\nrecurrences Use tail-folding for loops containing fixed order "
201 "\nnorecurrences Inverse of above"
202 "\nreverse Use tail-folding for loops requiring reversed "
204 "\nnoreverse Inverse of above"),
          .Case("__arm_sme_state", true)
          .Case("__arm_tpidr2_save", true)
          .Case("__arm_tpidr2_restore", true)
          .Case("__arm_za_disable", true)
    if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
        (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
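  // These checks detect caller/callee pairs that would need extra SME state
  // management around the call: a lazy ZA save, a PSTATE.SM change, or
  // preservation of ZT0 / all ZA state.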
  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
      CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
  auto FVTy = dyn_cast<FixedVectorType>(Ty);
         FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
                                     unsigned DefaultCallPenalty) const {
  if (F == Call.getCaller())
  return DefaultCallPenalty;
  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
  return std::max<InstructionCost>(1, Cost);
  unsigned ImmIdx = ~0U;
  case Instruction::GetElementPtr:
  case Instruction::Store:
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    int NumConstants = (BitSize + 63) / 64;
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    int NumConstants = (BitSize + 63) / 64;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  if (TyWidth == 32 || TyWidth == 64)
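// Histogram cost: vectors wider than what a single HISTCNT can handle are
// counted as multiple histcnt operations (TotalHistCnts below).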
  unsigned TotalHistCnts = 1;
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
    unsigned EC = VTy->getElementCount().getKnownMinValue();
    unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
    if (EC == 2 || (LegalEltSize == 32 && EC == 4))
    TotalHistCnts = EC / NaturalVectorWidth;
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
    if (LT.second == MVT::v2i64)
    if (any_of(ValidMinMaxTys, [&](MVT M) { return M == LT.second; }))
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
    if (any_of(ValidAbsTys, [&](MVT M) { return M == LT.second; }))
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    if (any_of(ValidAbsTys, [&](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
  case Intrinsic::stepvector: {
    Cost += AddCost * (LT.first - 1);
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
  case Intrinsic::bitreverse: {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
      return LegalisationCost.first * Entry->Cost + 1;
    return LegalisationCost.first * Entry->Cost;
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
                             RetTy->getScalarSizeInBits()
    return LT.first * Entry->Cost + ExtraCost;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2},
        {Intrinsic::umul_with_overflow, MVT::i32, 2},
        {Intrinsic::smul_with_overflow, MVT::i64, 3},
        {Intrinsic::umul_with_overflow, MVT::i64, 3},
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64)) {
          (LT.second == MVT::f64 && MTy == MVT::i32) ||
          (LT.second == MVT::f32 && MTy == MVT::i64)))
    if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
    if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
        (LT.second == MVT::f16 && MTy == MVT::i64) ||
        ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         LT.second.getScalarType() == MVT::f16) &&
    if (LT.second.isVector())
                                    LegalTy, {LegalTy, LegalTy});
                                    LegalTy, {LegalTy, LegalTy});
    return LT.first * Cost +
           ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
    if (LT.second.isVector()) {
    Type *CondTy = RetTy->getWithNewBitWidth(1);
    return LT.first * Cost;
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
        {Intrinsic::fshl, MVT::v4i32, 3},
        {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
        {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
        {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
      return LegalisationCost.first * Entry->Cost;
    if (!RetTy->isIntegerTy())
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
    return TyL.first + ExtraCost;
  case Intrinsic::get_active_lane_mask: {
    if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
      return RetTy->getNumElements() * 2;
  case Intrinsic::experimental_vector_match: {
    auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
    unsigned SearchSize = NeedleTy->getNumElements();
    if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
      if (isa<FixedVectorType>(RetTy))
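// Fold a PHI of aarch64_sve_convert_to_svbool reinterprets into a PHI of the
// original predicate values, so the reinterprets can be dropped.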
  auto RequiredType = II.getType();
  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");
  if (!PN->hasOneUse())
  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
static std::optional<Instruction *>
  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
  auto BinOpPred = BinOp->getOperand(0);
  auto BinOpOp1 = BinOp->getOperand(1);
  auto BinOpOp2 = BinOp->getOperand(2);
  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return std::nullopt;
  auto PredOp = PredIntr->getOperand(0);
  auto PredOpTy = cast<VectorType>(PredOp->getType());
  if (PredOpTy != II.getType())
    return std::nullopt;
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
  auto NarrowedBinOp =
static std::optional<Instruction *>
  if (isa<PHINode>(II.getArgOperand(0)))
    return BinOpCombine;
  if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
      isa<TargetExtType>(II.getType()))
    return std::nullopt;
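  // Walk back through chains of convert_to/from_svbool reinterprets looking
  // for the earliest value that already has the type this intrinsic produces.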
  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
  const auto *IVTy = cast<VectorType>(II.getType());
    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())
    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;
    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))
    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);
  if (!EarliestReplacement)
    return std::nullopt;
  Value *UncastedPred;
  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                      m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
    if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
        cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
      Pred = UncastedPred;
  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                         m_ConstantInt<AArch64SVEPredPattern::all>()));
static std::optional<Instruction *>
                               bool hasInactiveVector) {
  int PredOperand = hasInactiveVector ? 1 : 0;
  int ReplaceOperand = hasInactiveVector ? 0 : 1;
    return std::nullopt;
static std::optional<Instruction *>
      !isa<llvm::UndefValue>(II.getOperand(0)) &&
      !isa<llvm::PoisonValue>(II.getOperand(0))) {
static std::optional<Instruction *>
    return std::nullopt;
static std::optional<Instruction *>
  if (RetTy->isStructTy()) {
    auto StructT = cast<StructType>(RetTy);
    auto VecT = StructT->getElementType(0);
    for (unsigned i = 0; i < StructT->getNumElements(); i++) {
      ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
                                                : ConstantInt::get(VecT, 0));
                   : ConstantInt::get(II.getType(), 0);
    return std::nullopt;
  auto *OpPredicate = II.getOperand(0);
    return std::nullopt;
    return std::nullopt;
  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::vl1)
    return std::nullopt;
      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
  Insert->insertBefore(&II);
  Insert->takeName(&II);
  auto *RetTy = cast<ScalableVectorType>(II.getType());
                                          II.getArgOperand(0));
  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;
  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return std::nullopt;
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;
  auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
  if (!DupQLaneIdx || !DupQLaneIdx->isZero())
    return std::nullopt;
  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;
  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;
  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
    return std::nullopt;
  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;
  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;
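  // Build a 16-bit mask of the active lanes of the constant predicate, then
  // check that it matches a pattern a single PTRUE can produce.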
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
      return std::nullopt;
      PredicateBits |= 1 << (I * (16 / NumElts));
  if (PredicateBits == 0) {
    PFalse->takeName(&II);
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
  unsigned PredSize = Mask & -Mask;
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;
                                        {PredType}, {PTruePat});
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
                                {II.getType()}, {ConvertToSVBool});
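// lasta/lastb: when the governing predicate is all-false (lasta) or a ptrue
// with a known pattern, the result is a plain extractelement at a constant
// lane index.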
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
    auto *OldBinOp = cast<BinaryOperator>(Vec);
    auto OpC = OldBinOp->getOpcode();
        OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    Extract->insertBefore(&II);
    Extract->takeName(&II);
  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
    return std::nullopt;
  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;
  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
    return std::nullopt;
  unsigned Idx = MinNumElts - 1;
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;
  Extract->insertBefore(&II);
  Extract->takeName(&II);
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(2);
    return std::nullopt;
    return std::nullopt;
      FPTy, cast<VectorType>(Vec->getType())->getElementCount());
      II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
                                           {II.getType()}, {AllPat});
static std::optional<Instruction *>
  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
  if (Pattern == AArch64SVEPredPattern::all) {
    Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
  return MinNumElts && NumElts >= MinNumElts
                 II, ConstantInt::get(II.getType(), MinNumElts)))
  Value *PgVal = II.getArgOperand(0);
  Value *OpVal = II.getArgOperand(1);
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    return std::nullopt;
  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
  return std::nullopt;
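// Fuse a predicated multiply feeding an add/sub into the corresponding
// multiply-accumulate form (e.g. mla/mad, fmla/fmad), provided the multiply
// result has no other uses.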
template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
                                  bool MergeIntoAddendOp) {
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(1);
    Mul = II.getOperand(2);
    AddendOp = II.getOperand(2);
    Mul = II.getOperand(1);
    return std::nullopt;
  if (!Mul->hasOneUse())
    return std::nullopt;
  if (II.getType()->isFPOrFPVectorTy()) {
      return std::nullopt;
    return std::nullopt;
  if (MergeIntoAddendOp)
        {P, AddendOp, MulOp0, MulOp1}, FMFSource);
        {P, MulOp0, MulOp1, AddendOp}, FMFSource);
static std::optional<Instruction *>
  Value *Pred = II.getOperand(0);
  Value *PtrOp = II.getOperand(1);
  Type *VecTy = II.getType();
  Load->copyMetadata(II);
static std::optional<Instruction *>
  Value *VecOp = II.getOperand(0);
  Value *Pred = II.getOperand(1);
  Value *PtrOp = II.getOperand(2);
  Store->copyMetadata(II);
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_fmul_u:
    return Instruction::BinaryOps::FMul;
  case Intrinsic::aarch64_sve_fadd_u:
    return Instruction::BinaryOps::FAdd;
  case Intrinsic::aarch64_sve_fsub_u:
    return Instruction::BinaryOps::FSub;
    return Instruction::BinaryOpsEnd;
static std::optional<Instruction *>
  if (II.isStrictFP())
    return std::nullopt;
  auto *OpPredicate = II.getOperand(0);
  if (BinOpCode == Instruction::BinaryOpsEnd ||
      !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;
  auto *OpPredicate = II.getOperand(0);
  if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;
  auto *Mod = II.getModule();
  II.setCalledFunction(NewDecl);
static std::optional<Instruction *>
                                        Intrinsic::aarch64_sve_mla>(
                                        Intrinsic::aarch64_sve_mad>(
  return std::nullopt;
static std::optional<Instruction *>
                                        Intrinsic::aarch64_sve_fmla>(IC, II,
                                        Intrinsic::aarch64_sve_fmad>(IC, II,
                                        Intrinsic::aarch64_sve_fmla>(IC, II,
  return std::nullopt;
static std::optional<Instruction *>
                                        Intrinsic::aarch64_sve_fmla>(IC, II,
                                        Intrinsic::aarch64_sve_fmad>(IC, II,
                                        Intrinsic::aarch64_sve_fmla_u>(
static std::optional<Instruction *>
                                        Intrinsic::aarch64_sve_fmls>(IC, II,
                                        Intrinsic::aarch64_sve_fnmsb>(
                                        Intrinsic::aarch64_sve_fmls>(IC, II,
  return std::nullopt;
static std::optional<Instruction *>
                                        Intrinsic::aarch64_sve_fmls>(IC, II,
                                        Intrinsic::aarch64_sve_fnmsb>(
                                        Intrinsic::aarch64_sve_fmls_u>(
                                        Intrinsic::aarch64_sve_mls>(
  return std::nullopt;
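// Fold a predicated multiply by one: if the multiplier is a splat of 1 (or an
// sve.dup of 1 governed by the same predicate), replace the multiply with its
// multiplicand.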
  auto *OpPredicate = II.getOperand(0);
  auto *OpMultiplicand = II.getOperand(1);
  auto *OpMultiplier = II.getOperand(2);
  auto IsUnitSplat = [](auto *I) {
  auto IsUnitDup = [](auto *I) {
    auto *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
    auto *SplatValue = IntrI->getOperand(2);
  if (IsUnitSplat(OpMultiplier)) {
    OpMultiplicand->takeName(&II);
  } else if (IsUnitDup(OpMultiplier)) {
    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
    auto *DupPg = DupInst->getOperand(1);
    if (OpPredicate == DupPg) {
      OpMultiplicand->takeName(&II);
  Value *UnpackArg = II.getArgOperand(0);
  auto *RetTy = cast<ScalableVectorType>(II.getType());
  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
                  II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
  return std::nullopt;
  auto *OpVal = II.getOperand(0);
  auto *OpIndices = II.getOperand(1);
  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
    return std::nullopt;
  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
  if ((match(II.getArgOperand(0),
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
    auto *TyA = cast<ScalableVectorType>(A->getType());
    if (TyA == B->getType() &&
  return std::nullopt;
  if (match(II.getArgOperand(0),
      match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
        II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
  return std::nullopt;
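// Gather/scatter combine: when the index operand is a linear series produced
// by aarch64_sve_index, the access can be rewritten as a contiguous masked
// load/store from a base pointer adjusted by the series start.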
static std::optional<Instruction *>
  Value *Mask = II.getOperand(0);
  Value *BasePtr = II.getOperand(1);
  Value *Index = II.getOperand(2);
  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
        BasePtr->getPointerAlignment(II.getDataLayout());
                                      BasePtr, IndexBase);
  return std::nullopt;
static std::optional<Instruction *>
  Value *Val = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *BasePtr = II.getOperand(2);
  Value *Index = II.getOperand(3);
  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
        BasePtr->getPointerAlignment(II.getDataLayout());
                                      BasePtr, IndexBase);
  return std::nullopt;
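// Fold a predicated signed divide by a power-of-two splat into
// aarch64_sve_asrd (arithmetic shift right for divide); a negative
// power-of-two divisor additionally negates the shifted result.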
  Value *Pred = II.getOperand(0);
  Value *Vec = II.getOperand(1);
  Value *DivVec = II.getOperand(2);
  ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
  if (!SplatConstantInt)
    return std::nullopt;
  if (DivisorValue == -1)
    return std::nullopt;
  if (DivisorValue == 1)
        Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
        Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
        Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
  return std::nullopt;
  size_t VecSize = Vec.size();
  size_t HalfVecSize = VecSize / 2;
    if (*LHS != nullptr && *RHS != nullptr) {
    if (*LHS == nullptr && *RHS != nullptr)
            m_Intrinsic<Intrinsic::vector_insert>(
      !isa<FixedVectorType>(CurrentInsertElt->getType()))
    return std::nullopt;
  auto IIScalableTy = cast<ScalableVectorType>(II.getType());
  while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
    CurrentInsertElt = InsertElt->getOperand(0);
      isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
    return std::nullopt;
  for (size_t I = 0; I < Elts.size(); I++) {
    if (Elts[I] == nullptr)
  if (InsertEltChain == nullptr)
    return std::nullopt;
  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                                 IIScalableTy->getMinNumElements() /
  auto *WideShuffleMaskTy =
  auto NarrowBitcast =
  return std::nullopt;
  Value *Pred = II.getOperand(0);
  Value *Vec = II.getOperand(1);
  Value *Shift = II.getOperand(2);
  Value *AbsPred, *MergedValue;
  if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
      !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
    return std::nullopt;
    return std::nullopt;
    return std::nullopt;
                                     {II.getType()}, {Pred, Vec, Shift});
  Value *Vec = II.getOperand(0);
    return std::nullopt;
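  // Scan forward, within a small lookahead budget, for an identical dmb with
  // no intervening memory access or other side effect; if one is found this
  // barrier is redundant and can be removed.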
  auto *NI = II.getNextNonDebugInstruction();
    return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
  while (LookaheadThreshold-- && CanSkipOver(NI)) {
    auto *NIBB = NI->getParent();
    NI = NI->getNextNonDebugInstruction();
      if (auto *SuccBB = NIBB->getUniqueSuccessor())
        NI = SuccBB->getFirstNonPHIOrDbgOrLifetime();
  auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
  if (NextII && II.isIdenticalTo(NextII))
  return std::nullopt;
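// InstCombine entry point for AArch64 intrinsics: dispatch on the intrinsic ID
// to the target-specific combines defined above.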
std::optional<Instruction *>
  case Intrinsic::aarch64_dmb:
  case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
  case Intrinsic::aarch64_sve_fcvt_f16f32:
  case Intrinsic::aarch64_sve_fcvt_f16f64:
  case Intrinsic::aarch64_sve_fcvt_f32f16:
  case Intrinsic::aarch64_sve_fcvt_f32f64:
  case Intrinsic::aarch64_sve_fcvt_f64f16:
  case Intrinsic::aarch64_sve_fcvt_f64f32:
  case Intrinsic::aarch64_sve_fcvtlt_f32f16:
  case Intrinsic::aarch64_sve_fcvtlt_f64f32:
  case Intrinsic::aarch64_sve_fcvtx_f32f64:
  case Intrinsic::aarch64_sve_fcvtzs:
  case Intrinsic::aarch64_sve_fcvtzs_i32f16:
  case Intrinsic::aarch64_sve_fcvtzs_i32f64:
  case Intrinsic::aarch64_sve_fcvtzs_i64f16:
  case Intrinsic::aarch64_sve_fcvtzs_i64f32:
  case Intrinsic::aarch64_sve_fcvtzu:
  case Intrinsic::aarch64_sve_fcvtzu_i32f16:
  case Intrinsic::aarch64_sve_fcvtzu_i32f64:
  case Intrinsic::aarch64_sve_fcvtzu_i64f16:
  case Intrinsic::aarch64_sve_fcvtzu_i64f32:
  case Intrinsic::aarch64_sve_scvtf:
  case Intrinsic::aarch64_sve_scvtf_f16i32:
  case Intrinsic::aarch64_sve_scvtf_f16i64:
  case Intrinsic::aarch64_sve_scvtf_f32i64:
  case Intrinsic::aarch64_sve_scvtf_f64i32:
  case Intrinsic::aarch64_sve_ucvtf:
  case Intrinsic::aarch64_sve_ucvtf_f16i32:
  case Intrinsic::aarch64_sve_ucvtf_f16i64:
  case Intrinsic::aarch64_sve_ucvtf_f32i64:
  case Intrinsic::aarch64_sve_ucvtf_f64i32:
  case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
  case Intrinsic::aarch64_sve_fcvtnt_f16f32:
  case Intrinsic::aarch64_sve_fcvtnt_f32f64:
  case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
  case Intrinsic::aarch64_sve_st1_scatter:
  case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_st1_scatter_sxtw:
  case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
  case Intrinsic::aarch64_sve_st1_scatter_uxtw:
  case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
  case Intrinsic::aarch64_sve_st1dq:
  case Intrinsic::aarch64_sve_st1q_scatter_index:
  case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
  case Intrinsic::aarch64_sve_st1wq:
  case Intrinsic::aarch64_sve_stnt1:
  case Intrinsic::aarch64_sve_stnt1_scatter:
  case Intrinsic::aarch64_sve_stnt1_scatter_index:
  case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
  case Intrinsic::aarch64_sve_st2:
  case Intrinsic::aarch64_sve_st2q:
  case Intrinsic::aarch64_sve_st3:
  case Intrinsic::aarch64_sve_st3q:
  case Intrinsic::aarch64_sve_st4:
  case Intrinsic::aarch64_sve_st4q:
  case Intrinsic::aarch64_sve_addqv:
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_brka_z:
  case Intrinsic::aarch64_sve_brkb_z:
  case Intrinsic::aarch64_sve_brkn_z:
  case Intrinsic::aarch64_sve_brkpa_z:
  case Intrinsic::aarch64_sve_brkpb_z:
  case Intrinsic::aarch64_sve_cntp:
  case Intrinsic::aarch64_sve_compact:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_eorv:
  case Intrinsic::aarch64_sve_eorqv:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
  case Intrinsic::aarch64_sve_orv:
  case Intrinsic::aarch64_sve_orqv:
  case Intrinsic::aarch64_sve_pnext:
  case Intrinsic::aarch64_sve_rdffr_z:
  case Intrinsic::aarch64_sve_saddv:
  case Intrinsic::aarch64_sve_uaddv:
  case Intrinsic::aarch64_sve_umaxv:
  case Intrinsic::aarch64_sve_umaxqv:
  case Intrinsic::aarch64_sve_cmpeq:
  case Intrinsic::aarch64_sve_cmpeq_wide:
  case Intrinsic::aarch64_sve_cmpge:
  case Intrinsic::aarch64_sve_cmpge_wide:
  case Intrinsic::aarch64_sve_cmpgt:
  case Intrinsic::aarch64_sve_cmpgt_wide:
  case Intrinsic::aarch64_sve_cmphi:
  case Intrinsic::aarch64_sve_cmphi_wide:
  case Intrinsic::aarch64_sve_cmphs:
  case Intrinsic::aarch64_sve_cmphs_wide:
  case Intrinsic::aarch64_sve_cmple_wide:
  case Intrinsic::aarch64_sve_cmplo_wide:
  case Intrinsic::aarch64_sve_cmpls_wide:
  case Intrinsic::aarch64_sve_cmplt_wide:
  case Intrinsic::aarch64_sve_facge:
  case Intrinsic::aarch64_sve_facgt:
  case Intrinsic::aarch64_sve_fcmpeq:
  case Intrinsic::aarch64_sve_fcmpge:
  case Intrinsic::aarch64_sve_fcmpgt:
  case Intrinsic::aarch64_sve_fcmpne:
  case Intrinsic::aarch64_sve_fcmpuo:
  case Intrinsic::aarch64_sve_ld1_gather:
  case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ld1_gather_sxtw:
  case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
  case Intrinsic::aarch64_sve_ld1_gather_uxtw:
  case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
  case Intrinsic::aarch64_sve_ld1q_gather_index:
  case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
  case Intrinsic::aarch64_sve_ld1ro:
  case Intrinsic::aarch64_sve_ld1rq:
  case Intrinsic::aarch64_sve_ld1udq:
  case Intrinsic::aarch64_sve_ld1uwq:
  case Intrinsic::aarch64_sve_ld2_sret:
  case Intrinsic::aarch64_sve_ld2q_sret:
  case Intrinsic::aarch64_sve_ld3_sret:
  case Intrinsic::aarch64_sve_ld3q_sret:
  case Intrinsic::aarch64_sve_ld4_sret:
  case Intrinsic::aarch64_sve_ld4q_sret:
  case Intrinsic::aarch64_sve_ldff1:
  case Intrinsic::aarch64_sve_ldff1_gather:
  case Intrinsic::aarch64_sve_ldff1_gather_index:
  case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
  case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
  case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
  case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
  case Intrinsic::aarch64_sve_ldnf1:
  case Intrinsic::aarch64_sve_ldnt1:
  case Intrinsic::aarch64_sve_ldnt1_gather:
  case Intrinsic::aarch64_sve_ldnt1_gather_index:
  case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
  case Intrinsic::aarch64_sve_prf:
  case Intrinsic::aarch64_sve_prfb_gather_index:
  case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
  case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
  case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
  case Intrinsic::aarch64_sve_prfd_gather_index:
  case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
  case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
  case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
  case Intrinsic::aarch64_sve_prfh_gather_index:
  case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
  case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
  case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
  case Intrinsic::aarch64_sve_prfw_gather_index:
  case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
  case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
  case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm:
  case Intrinsic::aarch64_sve_convert_from_svbool:
  case Intrinsic::aarch64_sve_dup:
  case Intrinsic::aarch64_sve_dup_x:
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
  case Intrinsic::aarch64_sve_rdffr:
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
  case Intrinsic::aarch64_sve_clasta_n:
  case Intrinsic::aarch64_sve_clastb_n:
  case Intrinsic::aarch64_sve_cntd:
  case Intrinsic::aarch64_sve_cntw:
  case Intrinsic::aarch64_sve_cnth:
  case Intrinsic::aarch64_sve_cntb:
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
  case Intrinsic::aarch64_sve_fabd:
  case Intrinsic::aarch64_sve_fadd:
  case Intrinsic::aarch64_sve_fadd_u:
  case Intrinsic::aarch64_sve_fdiv:
  case Intrinsic::aarch64_sve_fmax:
  case Intrinsic::aarch64_sve_fmaxnm:
  case Intrinsic::aarch64_sve_fmin:
  case Intrinsic::aarch64_sve_fminnm:
  case Intrinsic::aarch64_sve_fmla:
  case Intrinsic::aarch64_sve_fmls:
  case Intrinsic::aarch64_sve_fmul:
  case Intrinsic::aarch64_sve_fmul_u:
  case Intrinsic::aarch64_sve_fmulx:
  case Intrinsic::aarch64_sve_fnmla:
  case Intrinsic::aarch64_sve_fnmls:
  case Intrinsic::aarch64_sve_fsub:
  case Intrinsic::aarch64_sve_fsub_u:
  case Intrinsic::aarch64_sve_add:
  case Intrinsic::aarch64_sve_add_u:
                                        Intrinsic::aarch64_sve_mla_u>(
  case Intrinsic::aarch64_sve_mla:
  case Intrinsic::aarch64_sve_mls:
  case Intrinsic::aarch64_sve_mul:
  case Intrinsic::aarch64_sve_mul_u:
  case Intrinsic::aarch64_sve_sabd:
  case Intrinsic::aarch64_sve_smax:
  case Intrinsic::aarch64_sve_smin:
  case Intrinsic::aarch64_sve_smulh:
  case Intrinsic::aarch64_sve_sub:
  case Intrinsic::aarch64_sve_sub_u:
                                        Intrinsic::aarch64_sve_mls_u>(
  case Intrinsic::aarch64_sve_uabd:
  case Intrinsic::aarch64_sve_umax:
  case Intrinsic::aarch64_sve_umin:
  case Intrinsic::aarch64_sve_umulh:
  case Intrinsic::aarch64_sve_asr:
  case Intrinsic::aarch64_sve_lsl:
  case Intrinsic::aarch64_sve_lsr:
  case Intrinsic::aarch64_sve_and:
  case Intrinsic::aarch64_sve_bic:
  case Intrinsic::aarch64_sve_eor:
  case Intrinsic::aarch64_sve_orr:
  case Intrinsic::aarch64_sve_sqsub:
  case Intrinsic::aarch64_sve_uqsub:
  case Intrinsic::aarch64_sve_tbl:
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
  case Intrinsic::aarch64_sve_uzp1:
  case Intrinsic::aarch64_sve_zip1:
  case Intrinsic::aarch64_sve_zip2:
  case Intrinsic::aarch64_sve_ld1_gather_index:
  case Intrinsic::aarch64_sve_st1_scatter_index:
  case Intrinsic::aarch64_sve_ld1:
  case Intrinsic::aarch64_sve_st1:
  case Intrinsic::aarch64_sve_sdiv:
  case Intrinsic::aarch64_sve_sel:
  case Intrinsic::aarch64_sve_srshl:
  case Intrinsic::aarch64_sve_dupq_lane:
  case Intrinsic::aarch64_sve_insr:
  return std::nullopt;
                                            SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::aarch64_neon_fcvtxn:
  case Intrinsic::aarch64_neon_rshrn:
  case Intrinsic::aarch64_neon_sqrshrn:
  case Intrinsic::aarch64_neon_sqrshrun:
  case Intrinsic::aarch64_neon_sqshrn:
  case Intrinsic::aarch64_neon_sqshrun:
  case Intrinsic::aarch64_neon_sqxtn:
  case Intrinsic::aarch64_neon_sqxtun:
  case Intrinsic::aarch64_neon_uqrshrn:
  case Intrinsic::aarch64_neon_uqshrn:
  case Intrinsic::aarch64_neon_uqxtn:
    SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
  return std::nullopt;
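// Returns true if Opcode, with operands extended from SrcTy, would select to
// one of the AArch64 widening instructions (e.g. [us]addl, [us]subl, [us]mull),
// in which case the extends themselves are modelled as free.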
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           Type *SrcOverrideTy) {
                           cast<VectorType>(DstTy)->getElementCount());
      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
  Type *SrcTy = SrcOverrideTy;
  case Instruction::Add:
  case Instruction::Sub:
    if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
  case Instruction::Mul: {
    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
  assert(SrcTy && "Expected some SrcTy");
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
      DstTyL.first * DstTyL.second.getVectorMinNumElements();
      SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
      (Src->isScalableTy() && !ST->hasSVE2()))
      dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
  if (AddUser && AddUser->getOpcode() == Instruction::Add)
  auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
  if (!Shr || Shr->getOpcode() != Instruction::LShr)
  auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
      Src->getScalarSizeInBits() !=
          cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
  assert(ISD && "Invalid opcode");
  if (I && I->hasOneUser()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
      if (SingleUser->getOpcode() == Instruction::Add) {
        if (I == SingleUser->getOperand(1) ||
            (isa<CastInst>(SingleUser->getOperand(1)) &&
             cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
  if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
  return Cost == 0 ? 0 : 1;
  EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
    std::pair<InstructionCost, MVT> LT =
    unsigned NumElements =
    return AdjustCost(Entry->Cost);
  if (ST->hasFullFP16())
      return AdjustCost(Entry->Cost);
        Opcode, LegalTy, Src, CCH, CostKind, I);
    return Part1 + Part2;
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
                             CostKind, Index, nullptr, nullptr);
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
  case Instruction::SExt:
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
  return Opcode == Instruction::PHI ? 0 : 1;
    unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  if (!LT.second.isVector())
  if (LT.second.isFixedLengthVector()) {
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;
  if (I && dyn_cast<LoadInst>(I->getOperand(1)))
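  // An extractelement whose only users are scalar fmuls can usually be folded
  // into an FMUL (by element) instruction, so extracts of lane 0 (or of lanes
  // that begin a 128-bit chunk) are given a reduced cost.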
  auto ExtractCanFuseWithFmul = [&]() {
    auto IsAllowedScalarTy = [&](const Type *T) {
      return T->isFloatTy() || T->isDoubleTy() ||
             (T->isHalfTy() && ST->hasFullFP16());
    auto IsUserFMulScalarTy = [](const Value *EEUser) {
      const auto *BO = dyn_cast<BinaryOperator>(EEUser);
      return BO && BO->getOpcode() == BinaryOperator::FMul &&
             !BO->getType()->isVectorTy();
    auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
      return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
    if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
      for (auto *U : Scalar->users()) {
        if (!IsUserFMulScalarTy(U))
        UserToExtractIdx[U];
      if (UserToExtractIdx.empty())
      for (auto &[S, U, L] : ScalarUserAndIdx) {
        for (auto *U : S->users()) {
          if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
            auto *FMul = cast<BinaryOperator>(U);
            auto *Op0 = FMul->getOperand(0);
            auto *Op1 = FMul->getOperand(1);
            if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
              UserToExtractIdx[U] = L;
      for (auto &[U, L] : UserToExtractIdx) {
      const auto *EE = cast<ExtractElementInst>(I);
      const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
      return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
        if (!IsUserFMulScalarTy(U))
        const auto *BO = cast<BinaryOperator>(U);
        const auto *OtherEE = dyn_cast<ExtractElementInst>(
            BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
        const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
        return IsExtractLaneEquivalentToZero(
            cast<ConstantInt>(OtherEE->getIndexOperand())
            OtherEE->getType()->getScalarSizeInBits());
  if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
      ExtractCanFuseWithFmul())
  return ST->getVectorInsertExtractBaseCost();
                                                   unsigned Index, Value *Op0,
      Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
  return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
  return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
  if (isa<ScalableVectorType>(Ty))
  return DemandedElts.popcount() * (Insert + Extract) *
  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
                                        Op2Info, Args, CxtI);
  return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
  if (!VT.isVector() && VT.getSizeInBits() > 64)
        Opcode, Ty, CostKind, Op1Info, Op2Info);
    if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
                                            ->getPrimitiveSizeInBits()
                                            .getFixedValue() < 128) {
    if (nullptr != Entry)
    if (LT.second.getScalarType() == MVT::i8)
    else if (LT.second.getScalarType() == MVT::i16)
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      return (4 + DivCost) * VTy->getNumElements();
    if (LT.second == MVT::v2i64 && ST->hasSVE())
    if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
    return LT.first * 14;
         (Ty->isHalfTy() && ST->hasFullFP16())) &&
      return 2 * LT.first;
      return 2 * LT.first;
    int MaxMergeDistance = 64;
      return NumVectorInstToHideOverhead;
                                  Op1Info, Op2Info, I);
  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
    const int AmortizationCost = 20;
      VecPred = CurrentPred;
      static const auto ValidMinMaxTys = {
          MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
          MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
      static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
      if (any_of(ValidMinMaxTys, [&](MVT M) { return M == LT.second; }) ||
          (ST->hasFullFP16() &&
           any_of(ValidFP16MinMaxTys, [&](MVT M) { return M == LT.second; })))
    VectorSelectTbl[] = {
        { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
        { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
        { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
    if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
      return LT.first * 4;
                                       Op1Info, Op2Info, I);
  if (ST->requiresStrictAlign()) {
  Options.AllowOverlappingLoads = true;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.AllowedTailExpansions = {3, 5, 6};
  return ST->hasSVE();
  if (!LT.first.isValid())
  auto *VT = cast<VectorType>(Src);
  if (VT->getElementType()->isIntegerTy(1))
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Should be called on only load or stores.");
  case Instruction::Load:
      return ST->getGatherOverhead();
  case Instruction::Store:
      return ST->getScatterOverhead();
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
  auto *VT = cast<VectorType>(DataTy);
  if (!LT.first.isValid())
  if (!LT.second.isVector() ||
      VT->getElementType()->isIntegerTy(1))
  ElementCount LegalVF = LT.second.getVectorElementCount();
                                       {TTI::OK_AnyValue, TTI::OP_None}, I);
  if (VT == MVT::Other)
  if (!LT.first.isValid())
  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
      (VTy->getElementType()->isIntegerTy(1) &&
       !VTy->getElementCount().isKnownMultipleOf(
  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    const int AmortizationCost = 6;
    return LT.first * 2 * AmortizationCost;
  if (VT == MVT::v4i8)
    return cast<FixedVectorType>(Ty)->getNumElements() * 2;
  if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
      *Alignment != Align(1))
  while (!TypeWorklist.empty()) {
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  auto *VecVTy = cast<VectorType>(VecTy);
  if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
        VecVTy->getElementCount().divideCoefficientBy(Factor));
    if (MinElts % Factor == 0 &&
                                           UseMaskForCond, UseMaskForGaps);
  for (auto *I : Tys) {
    if (!I->isVectorTy())
    if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
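// Count loads in the loop whose address is a non-invariant affine AddRec
// (i.e. strided loads), giving up once more than MaxStridedLoads / 2 have
// been seen.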
  enum { MaxStridedLoads = 7 };
    int StridedLoads = 0;
    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);
        if (L->isLoopInvariant(PtrValue))
        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;
    return StridedLoads;
  int StridedLoads = countStridedLoads(L, SE);
                    << " strided loads\n");
  if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
  if (Header == L->getLoopLatch()) {
    for (auto *BB : L->blocks()) {
      for (auto &I : *BB) {
        if (isa<LoadInst>(&I))
    unsigned MaxInstsPerLine = 16;
    unsigned BestUC = 1;
    unsigned SizeWithBestUC = BestUC * Size;
      unsigned SizeWithUC = UC * Size;
      if (SizeWithUC > 48)
      if ((SizeWithUC % MaxInstsPerLine) == 0 ||
          (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
        SizeWithBestUC = BestUC * Size;
      return LoadedValues.contains(SI->getOperand(0));
  auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
  auto *Latch = L->getLoopLatch();
  if (!Term || !Term->isConditional() || Preds.size() == 1 ||
    if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
    if (isa<LoadInst>(I))
      auto *I = dyn_cast<Instruction>(V);
      return I && DependsOnLoopLoad(I, Depth + 1);
      DependsOnLoopLoad(I, 0)) {
  if (L->getLoopDepth() > 1)
  case AArch64Subtarget::AppleA14:
  case AArch64Subtarget::AppleA15:
  case AArch64Subtarget::AppleA16:
  case AArch64Subtarget::AppleM4:
  case AArch64Subtarget::Falkor:
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      if (I.getType()->isVectorTy())
      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
      !ST->getSchedModel().isOutOfOrder()) {
                                           Type *ExpectedType) {
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    StructType *ST = dyn_cast<StructType>(ExpectedType);
    unsigned NumElts = Inst->arg_size() - 1;
    if (ST->getNumElements() != NumElts)
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
  Type *ConsideredSExtType =
  if (I.getType() != ConsideredSExtType)
  for (const User *U : I.users()) {
      Considerable = true;
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
  return Considerable;
  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
    return LegalizationCost + 2;
  LegalizationCost *= LT.first - 1;
  assert(ISD && "Invalid opcode");
  return LegalizationCost + 2;
                                           std::optional<FastMathFlags> FMF,
  if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
  if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
      return BaseCost + FixedVTy->getNumElements();
  if (Opcode != Instruction::FAdd)
  auto *VTy = cast<ScalableVectorType>(ValTy);
  if (isa<ScalableVectorType>(ValTy))
  MVT MTy = LT.second;
  assert(ISD && "Invalid opcode");
      MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
                         (EltTy->isHalfTy() && ST->hasFullFP16()))) {
    return (LT.first - 1) + Log2_32(NElts);
    return (LT.first - 1) + Entry->Cost;
  auto *ValVTy = cast<FixedVectorType>(ValTy);
  if (LT.first != 1) {
    ExtraCost *= LT.first - 1;
  auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
  return Cost + ExtraCost;
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
  if (LT.second.getScalarType() == MVT::i1) {
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
      Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
    if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
      return std::max<InstructionCost>(1, LT.first / 4);
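    // Split an oversized shuffle into chunks of the legal vector width and
    // cost each chunk by how many distinct source registers its mask elements
    // reference.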
    unsigned TpNumElts = Mask.size();
    unsigned LTNumElts = LT.second.getVectorNumElements();
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
    for (unsigned N = 0; N < NumVecs; N++) {
      unsigned Source1, Source2;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
        unsigned Source = MaskElt / LTNumElts;
        if (NumSources == 0) {
        } else if (NumSources == 1 &