21#include "llvm/IR/IntrinsicsAArch64.h"
31#define DEBUG_TYPE "aarch64tti"
52 "Penalty of calling a function that requires a change to PSTATE.SM"));
56 cl::desc(
"Penalty of inlining a call that requires a change to PSTATE.SM"));
67 cl::desc(
"The cost of a histcnt instruction"));
class TailFoldingOption {

  bool NeedsDefault = true;

  void setNeedsDefault(bool V) { NeedsDefault = V; }

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;

    Bits &= ~DisableBits;

    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";

  void operator=(const std::string &Val) {

    setNeedsDefault(false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);

      setInitialBits(TailFoldingOpts::Disabled);
    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
181 "Control the use of vectorisation using tail-folding for SVE where the"
182 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
183 "\ndisabled (Initial) No loop types will vectorize using "
185 "\ndefault (Initial) Uses the default tail-folding settings for "
187 "\nall (Initial) All legal loop types will vectorize using "
189 "\nsimple (Initial) Use tail-folding for simple loops (not "
190 "reductions or recurrences)"
191 "\nreductions Use tail-folding for loops containing reductions"
192 "\nnoreductions Inverse of above"
193 "\nrecurrences Use tail-folding for loops containing fixed order "
195 "\nnorecurrences Inverse of above"
196 "\nreverse Use tail-folding for loops requiring reversed "
198 "\nnoreverse Inverse of above"),
      .Case("__arm_sme_state", true)
      .Case("__arm_tpidr2_save", true)
      .Case("__arm_tpidr2_restore", true)
      .Case("__arm_za_disable", true)
    if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
        (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||

  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs)) {

      TM.getSubtargetImpl(*Caller)->getFeatureBits();
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  return (CallerBits & CalleeBits) == CalleeBits;
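// Editorial note (hedged): SME state transitions between caller and callee
// (a lazy ZA save, a PSTATE.SM change, or ZT0 preservation) constrain
// inlining, and the final test above only treats a callee as inline-compatible
// when its required subtarget features are a subset of the caller's, i.e.
// (CallerBits & CalleeBits) == CalleeBits.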
  auto FVTy = dyn_cast<FixedVectorType>(Ty);

         FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;

                                          unsigned DefaultCallPenalty) const {

  if (F == Call.getCaller())

  return DefaultCallPenalty;

  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {

  return std::max<InstructionCost>(1, Cost);
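// Editorial note (hedged): the loop above walks the sign-extended immediate in
// 64-bit chunks, accumulating the materialisation cost of each chunk, and the
// result is clamped so that every immediate costs at least 1.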
  unsigned ImmIdx = ~0U;

  case Instruction::GetElementPtr:

  case Instruction::Store:

  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:

  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:

  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:

    int NumConstants = (BitSize + 63) / 64;
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)

  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:

    int NumConstants = (BitSize + 63) / 64;

  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))

  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))

  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))

  if (TyWidth == 32 || TyWidth == 64)
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
    if ((VTy->getElementCount().getKnownMinValue() != 2 &&
         VTy->getElementCount().getKnownMinValue() != 4) ||
        VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
        !VTy->isScalableTy())

  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:

  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,   MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,   MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,

    if (LT.second == MVT::v2i64)

    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,

        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;

  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,

    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))

  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};

    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())

  case Intrinsic::experimental_stepvector: {

    Cost += AddCost * (LT.first - 1);

  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {

    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
  case Intrinsic::bitreverse: {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},

        return LegalisationCost.first * Entry->Cost + 1;
      return LegalisationCost.first * Entry->Cost;

  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {

                      RetTy->getScalarSizeInBits()

    return LT.first * Entry->Cost + ExtraCost;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2},
        {Intrinsic::umul_with_overflow, MVT::i32, 2},
        {Intrinsic::smul_with_overflow, MVT::i64, 3},
        {Intrinsic::umul_with_overflow, MVT::i64, 3},
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;

    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64)) {
          (LT.second == MVT::f64 && MTy == MVT::i32) ||
          (LT.second == MVT::f32 && MTy == MVT::i64)))

    if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())

    if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
        (LT.second == MVT::f16 && MTy == MVT::i64) ||
        ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&

    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         LT.second.getScalarType() == MVT::f16) &&

      if (LT.second.isVector())
            LegalTy, {LegalTy, LegalTy});
            LegalTy, {LegalTy, LegalTy});
      return LT.first * Cost +
             ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0

    if (LT.second.isVector()) {
    Type *CondTy = RetTy->getWithNewBitWidth(1);

    return LT.first * Cost;
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
        {Intrinsic::fshl, MVT::v4i32, 3},
        {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
        {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
        {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};

      return LegalisationCost.first * Entry->Cost;

    if (!RetTy->isIntegerTy())

    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)

    return TyL.first + ExtraCost;

  case Intrinsic::get_active_lane_mask: {
    if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&

      return RetTy->getNumElements() * 2;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  if (!PN->hasOneUse())

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
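// Illustrative sketch (editorial, not from this file): a predicate PHI whose
// incoming values are all aarch64.sve.convert.to.svbool of the same narrower
// predicate type, and whose only use feeds a convert back to that type, can be
// rewritten as a PHI over the narrower predicates, dropping the reinterprets
// entirely.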
static std::optional<Instruction *>

  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));

  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:

  auto BinOpPred = BinOp->getOperand(0);
  auto BinOpOp1 = BinOp->getOperand(1);
  auto BinOpOp2 = BinOp->getOperand(2);

  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)

  auto PredOp = PredIntr->getOperand(0);
  auto PredOpTy = cast<VectorType>(PredOp->getType());
  if (PredOpTy != II.getType())

      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));

  auto NarrowedBinOp =
static std::optional<Instruction *>

  if (isa<PHINode>(II.getArgOperand(0)))

    return BinOpCombine;

  if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
      isa<TargetExtType>(II.getType()))
    return std::nullopt;

  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;

  const auto *IVTy = cast<VectorType>(II.getType());

    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())

    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))

    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);

  if (!EarliestReplacement)
    return std::nullopt;
  Value *UncastedPred;
  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                      m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(

    if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
        cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
      Pred = UncastedPred;

  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                         m_ConstantInt<AArch64SVEPredPattern::all>()));
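// Editorial note (hedged): a predicate qualifies as "all active" here when,
// after looking through a convert.from.svbool(convert.to.svbool(...)) pair
// that does not narrow it, it is an aarch64.sve.ptrue with the
// AArch64SVEPredPattern::all pattern.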
static std::optional<Instruction *>

  return std::nullopt;

static std::optional<Instruction *>

  if (RetTy->isStructTy()) {
    auto StructT = cast<StructType>(RetTy);
    auto VecT = StructT->getElementType(0);
    for (unsigned i = 0; i < StructT->getNumElements(); i++) {
      ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
                                                : ConstantInt::get(VecT, 0));
  } else if (RetTy->isFPOrFPVectorTy())

    Node = ConstantInt::get(II.getType(), 0);

  return std::nullopt;

  auto *OpPredicate = II.getOperand(0);

    return std::nullopt;
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::vl1)
    return std::nullopt;

      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
  Insert->insertBefore(&II);
  Insert->takeName(&II);
  auto *RetTy = cast<ScalableVectorType>(II.getType());
                                        II.getArgOperand(0));

  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return std::nullopt;

  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
    return std::nullopt;

  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
      return std::nullopt;
    PredicateBits |= 1 << (I * (16 / NumElts));

  if (PredicateBits == 0) {
    PFalse->takeName(&II);

  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)

  unsigned PredSize = Mask & -Mask;

  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

                                       {PredType}, {PTruePat});
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
                              {II.getType()}, {ConvertToSVBool});
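// Editorial summary (hedged): the sequence above recognises a compare of an
// all-active predicate against a dupq-broadcast constant vector of 0/1
// elements and, when the set bits form a uniform repeating pattern, replaces
// it with a single ptrue of the matching element width, converted back to the
// original predicate type via svbool.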
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

    auto *OldBinOp = cast<BinaryOperator>(Vec);
    auto OpC = OldBinOp->getOpcode();
        OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());

  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    Extract->insertBefore(&II);
    Extract->takeName(&II);

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
    return std::nullopt;

  unsigned Idx = MinNumElts - 1;

  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  Extract->insertBefore(&II);
  Extract->takeName(&II);
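// Editorial note (hedged): for lasta/lastb with a ptrue of a known fixed
// vector-length pattern, the selected lane index is a compile-time constant,
// so the intrinsic is rewritten as a plain extractelement at that index.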
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(2);

    return std::nullopt;

    return std::nullopt;

      FPTy, cast<VectorType>(Vec->getType())->getElementCount());
      II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});

                                          {II.getType()}, {AllPat});
static std::optional<Instruction *>

  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();

  if (Pattern == AArch64SVEPredPattern::all) {
    Constant *StepVal = ConstantInt::get(II.getType(), NumElts);

  return MinNumElts && NumElts >= MinNumElts
                 II, ConstantInt::get(II.getType(), MinNumElts)))

  Value *PgVal = II.getArgOperand(0);
  Value *OpVal = II.getArgOperand(1);

  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};

    return std::nullopt;

  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&

  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {

  return std::nullopt;
template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
                                  bool MergeIntoAddendOp) {

  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(1);
    Mul = II.getOperand(2);

    AddendOp = II.getOperand(2);
    Mul = II.getOperand(1);

    return std::nullopt;

  if (!Mul->hasOneUse())
    return std::nullopt;

  if (II.getType()->isFPOrFPVectorTy()) {
      return std::nullopt;
    return std::nullopt;

  if (MergeIntoAddendOp)
        {P, AddendOp, MulOp0, MulOp1}, FMFSource);
        {P, MulOp0, MulOp1, AddendOp}, FMFSource);
static std::optional<Instruction *>

  Value *Pred = II.getOperand(0);
  Value *PtrOp = II.getOperand(1);
  Type *VecTy = II.getType();

  Load->copyMetadata(II);

static std::optional<Instruction *>

  Value *VecOp = II.getOperand(0);
  Value *Pred = II.getOperand(1);
  Value *PtrOp = II.getOperand(2);

  Store->copyMetadata(II);
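// Editorial note (hedged): the two helpers above roughly rewrite sve.ld1 and
// sve.st1 as ordinary IR memory operations, using the governing predicate as
// the mask of an llvm.masked.load / llvm.masked.store when it is not known to
// be all-active, and copying the original instruction's metadata onto the
// replacement.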
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_fmul_u:
    return Instruction::BinaryOps::FMul;
  case Intrinsic::aarch64_sve_fadd_u:
    return Instruction::BinaryOps::FAdd;
  case Intrinsic::aarch64_sve_fsub_u:
    return Instruction::BinaryOps::FSub;

    return Instruction::BinaryOpsEnd;

static std::optional<Instruction *>

  if (II.isStrictFP())
    return std::nullopt;

  auto *OpPredicate = II.getOperand(0);
  if (BinOpCode == Instruction::BinaryOpsEnd ||
      !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;

  auto *OpPredicate = II.getOperand(0);
  if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;

  auto *Mod = II.getModule();
  II.setCalledFunction(NewDecl);
static std::optional<Instruction *>
                                           Intrinsic::aarch64_sve_mla>(
                                           Intrinsic::aarch64_sve_mad>(
  return std::nullopt;

static std::optional<Instruction *>
                                           Intrinsic::aarch64_sve_fmla>(IC, II,
                                           Intrinsic::aarch64_sve_fmad>(IC, II,
                                           Intrinsic::aarch64_sve_fmla>(IC, II,
  return std::nullopt;

static std::optional<Instruction *>
                                           Intrinsic::aarch64_sve_fmla>(IC, II,
                                           Intrinsic::aarch64_sve_fmad>(IC, II,
                                           Intrinsic::aarch64_sve_fmla_u>(

static std::optional<Instruction *>
                                           Intrinsic::aarch64_sve_fmls>(IC, II,
                                           Intrinsic::aarch64_sve_fnmsb>(
                                           Intrinsic::aarch64_sve_fmls>(IC, II,
  return std::nullopt;

static std::optional<Instruction *>
                                           Intrinsic::aarch64_sve_fmls>(IC, II,
                                           Intrinsic::aarch64_sve_fnmsb>(
                                           Intrinsic::aarch64_sve_fmls_u>(
                                           Intrinsic::aarch64_sve_mls>(
  return std::nullopt;
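// Editorial note (hedged): the dispatchers above try to fuse a predicated
// add/sub with a single-use multiply into the corresponding accumulating form
// (mla/mad, fmla/fmad, fmls/fnmsb and their unpredicated "_u" variants),
// falling back to std::nullopt when no operand shape matches.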
  auto *OpPredicate = II.getOperand(0);
  auto *OpMultiplicand = II.getOperand(1);
  auto *OpMultiplier = II.getOperand(2);

  auto IsUnitSplat = [](auto *I) {

  auto IsUnitDup = [](auto *I) {
    auto *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)

    auto *SplatValue = IntrI->getOperand(2);

  if (IsUnitSplat(OpMultiplier)) {
    OpMultiplicand->takeName(&II);
  } else if (IsUnitDup(OpMultiplier)) {
    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
    auto *DupPg = DupInst->getOperand(1);
    if (OpPredicate == DupPg) {
      OpMultiplicand->takeName(&II);

  Value *UnpackArg = II.getArgOperand(0);
  auto *RetTy = cast<ScalableVectorType>(II.getType());
  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
                  II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;

  return std::nullopt;
  auto *OpVal = II.getOperand(0);
  auto *OpIndices = II.getOperand(1);

  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
    return std::nullopt;

  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;

  if ((match(II.getArgOperand(0),
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||

    auto *TyA = cast<ScalableVectorType>(A->getType());
    if (TyA == B->getType() &&

  return std::nullopt;

  if (match(II.getArgOperand(0),
      match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(

        II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));

  return std::nullopt;
static std::optional<Instruction *>

  Value *Mask = II.getOperand(0);
  Value *BasePtr = II.getOperand(1);

  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
        BasePtr->getPointerAlignment(II.getDataLayout());
        BasePtr, IndexBase);

  return std::nullopt;

static std::optional<Instruction *>

  Value *Val = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *BasePtr = II.getOperand(2);

  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
        BasePtr->getPointerAlignment(II.getDataLayout());
        BasePtr, IndexBase);

  return std::nullopt;

  Value *Pred = II.getOperand(0);
  Value *Vec = II.getOperand(1);
  Value *DivVec = II.getOperand(2);

  ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
  if (!SplatConstantInt)
    return std::nullopt;

        Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
        Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
        Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});

  return std::nullopt;
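// Illustrative sketch (editorial): an sve.sdiv whose divisor is a splat of a
// positive power of two, e.g. a division by 8, becomes an sve.asrd by
// log2(8) = 3 under the same predicate; a splat of a negative power of two
// additionally negates the shifted result via sve.neg.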
  size_t VecSize = Vec.size();

  size_t HalfVecSize = VecSize / 2;

    if (*LHS != nullptr && *RHS != nullptr) {

    if (*LHS == nullptr && *RHS != nullptr)

          m_Intrinsic<Intrinsic::vector_insert>(
      !isa<FixedVectorType>(CurrentInsertElt->getType()))
    return std::nullopt;
  auto IIScalableTy = cast<ScalableVectorType>(II.getType());

  while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
    CurrentInsertElt = InsertElt->getOperand(0);

      isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
    return std::nullopt;

  for (size_t I = 0; I < Elts.size(); I++) {
    if (Elts[I] == nullptr)

  if (InsertEltChain == nullptr)
    return std::nullopt;

  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                                 IIScalableTy->getMinNumElements() /

  auto *WideShuffleMaskTy =

  auto NarrowBitcast =

  return std::nullopt;
  Value *Pred = II.getOperand(0);
  Value *Vec = II.getOperand(1);
  Value *Shift = II.getOperand(2);

  Value *AbsPred, *MergedValue;
  if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
      !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
    return std::nullopt;

    return std::nullopt;
    return std::nullopt;

                                       {II.getType()}, {Pred, Vec, Shift});
std::optional<Instruction *>

  case Intrinsic::aarch64_sve_st1_scatter:
  case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_st1_scatter_sxtw:
  case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
  case Intrinsic::aarch64_sve_st1_scatter_uxtw:
  case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
  case Intrinsic::aarch64_sve_st1dq:
  case Intrinsic::aarch64_sve_st1q_scatter_index:
  case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
  case Intrinsic::aarch64_sve_st1wq:
  case Intrinsic::aarch64_sve_stnt1:
  case Intrinsic::aarch64_sve_stnt1_scatter:
  case Intrinsic::aarch64_sve_stnt1_scatter_index:
  case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
  case Intrinsic::aarch64_sve_st2:
  case Intrinsic::aarch64_sve_st2q:
  case Intrinsic::aarch64_sve_st3:
  case Intrinsic::aarch64_sve_st3q:
  case Intrinsic::aarch64_sve_st4:
  case Intrinsic::aarch64_sve_st4q:
  case Intrinsic::aarch64_sve_ld1_gather:
  case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ld1_gather_sxtw:
  case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
  case Intrinsic::aarch64_sve_ld1_gather_uxtw:
  case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
  case Intrinsic::aarch64_sve_ld1q_gather_index:
  case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
  case Intrinsic::aarch64_sve_ld1ro:
  case Intrinsic::aarch64_sve_ld1rq:
  case Intrinsic::aarch64_sve_ld1udq:
  case Intrinsic::aarch64_sve_ld1uwq:
  case Intrinsic::aarch64_sve_ld2_sret:
  case Intrinsic::aarch64_sve_ld2q_sret:
  case Intrinsic::aarch64_sve_ld3_sret:
  case Intrinsic::aarch64_sve_ld3q_sret:
  case Intrinsic::aarch64_sve_ld4_sret:
  case Intrinsic::aarch64_sve_ld4q_sret:
  case Intrinsic::aarch64_sve_ldff1:
  case Intrinsic::aarch64_sve_ldff1_gather:
  case Intrinsic::aarch64_sve_ldff1_gather_index:
  case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
  case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
  case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
  case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
  case Intrinsic::aarch64_sve_ldnf1:
  case Intrinsic::aarch64_sve_ldnt1:
  case Intrinsic::aarch64_sve_ldnt1_gather:
  case Intrinsic::aarch64_sve_ldnt1_gather_index:
  case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm:
  case Intrinsic::aarch64_sve_convert_from_svbool:
  case Intrinsic::aarch64_sve_dup:
  case Intrinsic::aarch64_sve_dup_x:
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
  case Intrinsic::aarch64_sve_rdffr:
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
  case Intrinsic::aarch64_sve_clasta_n:
  case Intrinsic::aarch64_sve_clastb_n:
  case Intrinsic::aarch64_sve_cntd:
  case Intrinsic::aarch64_sve_cntw:
  case Intrinsic::aarch64_sve_cnth:
  case Intrinsic::aarch64_sve_cntb:
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
  case Intrinsic::aarch64_sve_fabd:
  case Intrinsic::aarch64_sve_fadd:
  case Intrinsic::aarch64_sve_fadd_u:
  case Intrinsic::aarch64_sve_fdiv:
  case Intrinsic::aarch64_sve_fmax:
  case Intrinsic::aarch64_sve_fmaxnm:
  case Intrinsic::aarch64_sve_fmin:
  case Intrinsic::aarch64_sve_fminnm:
  case Intrinsic::aarch64_sve_fmla:
  case Intrinsic::aarch64_sve_fmls:
  case Intrinsic::aarch64_sve_fmul:
  case Intrinsic::aarch64_sve_fmul_u:
  case Intrinsic::aarch64_sve_fmulx:
  case Intrinsic::aarch64_sve_fnmla:
  case Intrinsic::aarch64_sve_fnmls:
  case Intrinsic::aarch64_sve_fsub:
  case Intrinsic::aarch64_sve_fsub_u:
  case Intrinsic::aarch64_sve_add:
  case Intrinsic::aarch64_sve_add_u:
                                             Intrinsic::aarch64_sve_mla_u>(
  case Intrinsic::aarch64_sve_mla:
  case Intrinsic::aarch64_sve_mls:
  case Intrinsic::aarch64_sve_mul:
  case Intrinsic::aarch64_sve_mul_u:
  case Intrinsic::aarch64_sve_sabd:
  case Intrinsic::aarch64_sve_smax:
  case Intrinsic::aarch64_sve_smin:
  case Intrinsic::aarch64_sve_smulh:
  case Intrinsic::aarch64_sve_sub:
  case Intrinsic::aarch64_sve_sub_u:
                                             Intrinsic::aarch64_sve_mls_u>(
  case Intrinsic::aarch64_sve_uabd:
  case Intrinsic::aarch64_sve_umax:
  case Intrinsic::aarch64_sve_umin:
  case Intrinsic::aarch64_sve_umulh:
  case Intrinsic::aarch64_sve_asr:
  case Intrinsic::aarch64_sve_lsl:
  case Intrinsic::aarch64_sve_lsr:
  case Intrinsic::aarch64_sve_and:
  case Intrinsic::aarch64_sve_bic:
  case Intrinsic::aarch64_sve_eor:
  case Intrinsic::aarch64_sve_orr:
  case Intrinsic::aarch64_sve_sqsub:
  case Intrinsic::aarch64_sve_uqsub:
  case Intrinsic::aarch64_sve_tbl:
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
  case Intrinsic::aarch64_sve_uzp1:
  case Intrinsic::aarch64_sve_zip1:
  case Intrinsic::aarch64_sve_zip2:
  case Intrinsic::aarch64_sve_ld1_gather_index:
  case Intrinsic::aarch64_sve_st1_scatter_index:
  case Intrinsic::aarch64_sve_ld1:
  case Intrinsic::aarch64_sve_st1:
  case Intrinsic::aarch64_sve_sdiv:
  case Intrinsic::aarch64_sve_sel:
  case Intrinsic::aarch64_sve_srshl:
  case Intrinsic::aarch64_sve_dupq_lane:

  return std::nullopt;
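// Editorial note: the switch above merely routes each recognised intrinsic to
// one of the dedicated combines defined earlier in this file; anything not
// listed falls through and returns std::nullopt, leaving the call untouched.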
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::aarch64_neon_fcvtxn:
  case Intrinsic::aarch64_neon_rshrn:
  case Intrinsic::aarch64_neon_sqrshrn:
  case Intrinsic::aarch64_neon_sqrshrun:
  case Intrinsic::aarch64_neon_sqshrn:
  case Intrinsic::aarch64_neon_sqshrun:
  case Intrinsic::aarch64_neon_sqxtn:
  case Intrinsic::aarch64_neon_sqxtun:
  case Intrinsic::aarch64_neon_uqrshrn:
  case Intrinsic::aarch64_neon_uqshrn:
  case Intrinsic::aarch64_neon_uqxtn:
    SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);

  return std::nullopt;
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           Type *SrcOverrideTy) {

  auto toVectorTy = [&](Type *ArgTy) {
                           cast<VectorType>(DstTy)->getElementCount());

      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))

  Type *SrcTy = SrcOverrideTy;

  case Instruction::Add:
  case Instruction::Sub:
    if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
          toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());

  case Instruction::Mul: {
    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
          toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {

  assert(SrcTy && "Expected some SrcTy");

  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();

      DstTyL.first * DstTyL.second.getVectorMinNumElements();
      SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;

      (Src->isScalableTy() && !ST->hasSVE2()))

      dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
  if (AddUser && AddUser->getOpcode() == Instruction::Add)

  auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
  if (!Shr || Shr->getOpcode() != Instruction::LShr)

  auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
      Src->getScalarSizeInBits() !=
          cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())

  assert(ISD && "Invalid opcode");

  if (I && I->hasOneUser()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
      if (SingleUser->getOpcode() == Instruction::Add) {
        if (I == SingleUser->getOperand(1) ||
            (isa<CastInst>(SingleUser->getOperand(1)) &&
             cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))

  if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&

    return Cost == 0 ? 0 : 1;
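// Editorial note (hedged): when a sign/zero extend's only user is a widening
// add, sub or mul that the target can select as a single widening instruction
// (for example an unsigned extend feeding an add), the extend itself is
// treated as free, which is why the code above can report a cost of zero for
// such casts.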
    EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
    std::pair<InstructionCost, MVT> LT =
        LT.second.getScalarSizeInBits();

    return AdjustCost(Entry->Cost);

  if (ST->hasFullFP16())
      return AdjustCost(Entry->Cost);

        Opcode, LegalTy, Src, CCH, CostKind, I);
    return Part1 + Part2;

  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))

  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())

  case Instruction::SExt:
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)

  return Opcode == Instruction::PHI ? 0 : 1;
  if (!LT.second.isVector())

  if (LT.second.isFixedLengthVector()) {
    unsigned Width = LT.second.getVectorNumElements();

    if (I && dyn_cast<LoadInst>(I->getOperand(1)))

      Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
  return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);

  return getVectorInstrCostHelper(&I, Val, Index, true);

  if (isa<ScalableVectorType>(Ty))

  return DemandedElts.popcount() * (Insert + Extract) *

  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))

                                        Op2Info, Args, CxtI);

    return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;

        Opcode, Ty, CostKind, Op1Info, Op2Info);

    if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
                                            ->getPrimitiveSizeInBits()
                                            .getFixedValue() < 128) {

      if (nullptr != Entry)

    if (LT.second.getScalarType() == MVT::i8)

    else if (LT.second.getScalarType() == MVT::i16)

    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      return (4 + DivCost) * VTy->getNumElements();

    if (LT.second == MVT::v2i64 && ST->hasSVE())

    if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
      return LT.first * 14;

    return 2 * LT.first;

    return 2 * LT.first;

  int MaxMergeDistance = 64;

    return NumVectorInstToHideOverhead;
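// Editorial note (hedged): vector address computations with non-consecutive
// strides tend not to fold into the addressing mode, so they are charged
// NumVectorInstToHideOverhead; accesses whose constant stride stays within
// MaxMergeDistance are assumed to merge into the index mode instead.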
  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
    const int AmortizationCost = 20;

      VecPred = CurrentPred;

      static const auto ValidMinMaxTys = {
          MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
          MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
      static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

      if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
          (ST->hasFullFP16() &&
           any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))

        VectorSelectTbl[] = {
            {ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
            {ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
            {ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}

  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
    if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
      return LT.first * 4;
  if (ST->requiresStrictAlign()) {

  Options.AllowOverlappingLoads = true;

  Options.LoadSizes = {8, 4, 2, 1};
  Options.AllowedTailExpansions = {3, 5, 6};
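// Editorial note (hedged): these options let memcmp calls be expanded inline
// using 8/4/2/1-byte loads, with overlapping loads permitted and small tail
// sizes of 3, 5 and 6 bytes handled directly; when the subtarget requires
// strict alignment the expansion is left disabled.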
  return ST->hasSVE();

  if (!LT.first.isValid())

  auto *VT = cast<VectorType>(Src);
  if (VT->getElementType()->isIntegerTy(1))

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Should be called on only load or stores.");

  case Instruction::Load:
      return ST->getGatherOverhead();

  case Instruction::Store:
      return ST->getScatterOverhead();

    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,

  auto *VT = cast<VectorType>(DataTy);
  if (!LT.first.isValid())

  if (!LT.second.isVector() ||
      VT->getElementType()->isIntegerTy(1))

  ElementCount LegalVF = LT.second.getVectorElementCount();
      {TTI::OK_AnyValue, TTI::OP_None}, I);

  if (VT == MVT::Other)

  if (!LT.first.isValid())

  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
      (VTy->getElementType()->isIntegerTy(1) &&
       !VTy->getElementCount().isKnownMultipleOf(

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    const int AmortizationCost = 6;
    return LT.first * 2 * AmortizationCost;

  if (VT == MVT::v4i8)

  return cast<FixedVectorType>(Ty)->getNumElements() * 2;

  if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
      *Alignment != Align(1))

  while (!TypeWorklist.empty()) {
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  auto *VecVTy = cast<VectorType>(VecTy);

  if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))

  if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))

  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
        VecVTy->getElementCount().divideCoefficientBy(Factor));

    if (MinElts % Factor == 0 &&
                                          UseMaskForCond, UseMaskForGaps);
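// Editorial note (hedged): interleaved accesses with a factor the target can
// handle directly (NEON ld2/ld3/ld4 and st2-st4, or factor-2 accesses on
// scalable vectors with SVE) are costed as the legalised sub-vector accesses;
// anything else falls back to the generic, scalarising cost model.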
  for (auto *I : Tys) {
    if (!I->isVectorTy())
    if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==

  enum { MaxStridedLoads = 7 };

    int StridedLoads = 0;

    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);

        if (L->isLoopInvariant(PtrValue))

        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())

        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;

    return StridedLoads;

  int StridedLoads = countStridedLoads(L, SE);
                    << " strided loads\n");
  if (L->getLoopDepth() > 1)

  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      if (I.getType()->isVectorTy())

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {

      !ST->getSchedModel().isOutOfOrder()) {

                                         Type *ExpectedType) {
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    StructType *ST = dyn_cast<StructType>(ExpectedType);
    unsigned NumElts = Inst->arg_size() - 1;
    if (ST->getNumElements() != NumElts)
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
    for (unsigned i = 0, e = NumElts; i != e; ++i) {

  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)

  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;

  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;

  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;

  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;

  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))

  Type *ConsideredSExtType =
  if (I.getType() != ConsideredSExtType)

  for (const User *U : I.users()) {
      Considerable = true;

      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;

  return Considerable;
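// Editorial note (hedged): address type promotion is considered for a sext
// whose users are GEPs; if some user GEP has more than two operands, the
// promotion is allowed even without a common header block, which is what the
// AllowPromotionWithoutCommonHeader flag reports back to the caller.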
  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))

  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())

  return LegalizationCost + 2;

    LegalizationCost *= LT.first - 1;

  assert(ISD && "Invalid opcode");

  return LegalizationCost + 2;

                                           std::optional<FastMathFlags> FMF,

  if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))

  if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
    return BaseCost + FixedVTy->getNumElements();

  if (Opcode != Instruction::FAdd)

  auto *VTy = cast<ScalableVectorType>(ValTy);

  if (isa<ScalableVectorType>(ValTy))

  MVT MTy = LT.second;
  assert(ISD && "Invalid opcode");

    return (LT.first - 1) + Entry->Cost;

  auto *ValVTy = cast<FixedVectorType>(ValTy);

  if (LT.first != 1) {
    ExtraCost *= LT.first - 1;

  auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
  return Cost + ExtraCost;

  EVT PromotedVT = LT.second.getScalarType() == MVT::i1

  if (LT.second.getScalarType() == MVT::i1) {

  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
      Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {

    if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
      return std::max<InstructionCost>(1, LT.first / 4);

    unsigned TpNumElts = Mask.size();
    unsigned LTNumElts = LT.second.getVectorNumElements();
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
    for (unsigned N = 0; N < NumVecs; N++) {
      unsigned Source1, Source2;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]

        unsigned Source = MaskElt / LTNumElts;
        if (NumSources == 0) {
        } else if (NumSources == 1 && Source != Source1) {
        } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {

        if (Source == Source1)
        else if (Source == Source2)
          NMask.push_back(MaskElt % LTNumElts + LTNumElts);

      if (NumSources <= 2)
            NTp, NMask, CostKind, 0, nullptr, Args, CxtI);

  if (IsExtractSubvector && LT.second.isFixedLengthVector())

  bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
  if (IsLoad && LT.second.isVector() &&
          LT.second.getVectorElementCount()))

      all_of(Mask, [](int E) { return E < 8; }))

  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
        return M.value() < 0 || M.value() == (int)M.index();

  if (LT.second.isFixedLengthVector() &&
      LT.second.getVectorNumElements() == Mask.size() &&
      (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
       isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
       [&Mask](int M) { return M < 0 || M == Mask[0]; })))

    return LT.first * Entry->Cost;

      LT.second.getSizeInBits() <= 128 && SubTp) {
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)

  if (IsExtractSubvector)
  if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {

  unsigned NumInsns = 0;
    NumInsns += BB->sizeWithoutDebug();

                                               int64_t Scale,
                                               unsigned AddrSpace) const {

      isa<BranchInst>(I->getNextNode()) &&
      cast<BranchInst>(I->getNextNode())->isUnconditional())