#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Enable vectorization for wider vector utilization"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc(
        "When true, SLP vectorizer bypasses profitability checks based on "
        "heuristics and makes vectorization decision via cost modeling."));

    cl::desc("Attempt to vectorize horizontal reductions"));

        "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
  if (SLPReVec && isa<FixedVectorType>(Ty))
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&

  assert(!isa<ScalableVectorType>(Ty) && "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();

  for (unsigned I : seq<unsigned>(Mask.size()))
                        I * VecTyNumElements, VecTyNumElements)))
                  : Mask[I] * VecTyNumElements + J;

  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
      auto *SV = cast<ShuffleVectorInst>(V);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      for (int I : seq<int>(Index, Index + SV->getShuffleMask().size()))
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");

  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
                          : AccumulateLength + M);
    AccumulateLength += SVNumElements;

  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);

  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";

  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;

  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              ICmpInst::Predicate Pred;
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  if (MaskArg == UseMask::UndefsAsMask)
  if (MaskArg == UseMask::FirstArg && Value < VF)
    UseMask.reset(Value);
  else if (MaskArg == UseMask::SecondArg && Value >= VF)
    UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
    if (!UseMask.empty()) {
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        if (isa<T>(II->getOperand(1)))
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
      Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))

static std::optional<TargetTransformInfo::ShuffleKind>
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    auto *EE = dyn_cast<ExtractElementInst>(V);
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
    if (isa<UndefValue>(Vec)) {
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)

  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
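/// Main data required for vectorization of instructions: the common (main)
/// opcode of a bundle and, for alternate shuffles such as {add, sub, add,
/// sub}, the alternate opcode.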
struct InstructionsState {
  Value *OpValue = nullptr;
  unsigned getAltOpcode() const {
  bool isAltShuffle() const { return AltOp != MainOp; }
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  InstructionsState() = delete;
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
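/// \returns analysis of the instructions in \p VL described in
/// InstructionsState: the same/alternate opcodes, or an empty state when the
/// bundle cannot be vectorized as a single (possibly alternating) operation.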
                                       unsigned BaseIndex = 0);

         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||

         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&

                                       unsigned BaseIndex) {
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  bool SwappedPredsCompatible = [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;

  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
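/// Shuffles \p Mask in accordance with the given \p SubMask, i.e. composes the
/// sub-mask on top of the already accumulated mask.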
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
          (SubMask.size() == Mask.size() &&
           std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                       [](int Idx) { return Idx == PoisonMaskElem; }))) &&
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];

  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");

  Type *ScalarTy = VL[0]->getType();
  for (unsigned Lane : seq<unsigned>(VL.size()))
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;

  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];

  auto *I = dyn_cast<Instruction>(V);
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
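/// Bottom Up SLP Vectorizer.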
namespace slpvectorizer {

  struct ScheduleData;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),

    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    return MinBWs.at(VectorizableTree.front().get()).second;

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();

    return MaxVecRegSize;
    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;

                         bool TryRecursiveCheck = true) const;

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";

        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
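    /// Look-ahead score for a pair of candidate operand values: roughly,
    /// consecutive loads, identical constants/splats and instructions with the
    /// same opcode score higher than unrelated values, e.g. pairing two
    /// adjacent loads from %p and %p+1 in the same lane position is preferred
    /// over mixing a load with a constant.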
    if (isa<LoadInst>(V1)) {
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))

    auto CheckSameEntryOrFail = [&]() {
      if (const TreeEntry *TE1 = R.getTreeEntry(V1);
          TE1 && TE1 == R.getTreeEntry(V2))

    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        return CheckSameEntryOrFail();
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
        return CheckSameEntryOrFail();
      if (std::abs(*Dist) > NumLanes / 2)

    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);
      if (isa<UndefValue>(V2))
      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
      return CheckSameEntryOrFail();

    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
      if (S.getOpcode() &&
          (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
            return cast<Instruction>(V)->getNumOperands() ==
                   S.MainOp->getNumOperands();

    if (isa<UndefValue>(V2))
    return CheckSameEntryOrFail();

    int ShallowScoreAtThisLevel =
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, std::nullopt);
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
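    /// The following structures hold the operands of a bundle of instructions,
    /// one OperandData per operand per lane, organized as OpsVec[OpIdx][Lane],
    /// so each lane's operands can be reordered independently. For example,
    /// for the bundle { %x = add i32 %a, %b ; %y = add i32 %d, %c } reordering
    /// the second lane to (%c, %d) lets {%a, %c} and {%b, %d} be vectorized
    /// directly, without an extra shuffle.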
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      bool IsUsed = false;

    enum class ReorderingMode {

    const Loop *L = nullptr;

    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];

    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];

      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
          OpsVec[OpIdx][Lane].IsUsed = false;

    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);

    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
        Uniques.insert(OpIdxLnV);
      int UniquesCount = Uniques.size();
      int UniquesCntWithIdxLaneV =
          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      int UniquesCntWithOpIdxLaneV =
          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
                 UniquesCntWithOpIdxLaneV) -
             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);

    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
      return R.areAllUsersVectorized(IdxLaneI)

    static const int ScoreScaleFactor = 10;

                          int Lane, unsigned OpIdx, unsigned Idx,
        int SplatScore = getSplatScore(Lane, OpIdx, Idx);
        if (Score <= -SplatScore) {
          Score += SplatScore;
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
      unsigned NumOperands = getNumOperands();
      Value *OpLastLane = getData(OpIdx, LastLane).V;
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;
      bool OpIdxAPO = getData(OpIdx, Lane).APO;
        std::optional<unsigned> Idx;
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        OperandData &OpData = getData(Idx, Lane);
        bool OpAPO = OpData.APO;
        if (OpAPO != OpIdxAPO)
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            if (isa<Constant>(Op)) {
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Failed:
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return std::nullopt;
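    /// Heuristically picks the lane from which greedy operand reordering
    /// starts; a lane whose operands leave little freedom for reordering is a
    /// natural anchor for the remaining lanes.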
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto *It = HashMap.find(NumFreeOpsHash.Hash);
          if (It == HashMap.end())
            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;

    struct OperandsOrderData {
      unsigned NumOfAPOs = UINT_MAX;
      unsigned NumOpsWithSameOpcodeParent = 0;

    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
            Parent = I->getParent();
            --NumOpsWithSameOpcodeParent;
            ++NumOpsWithSameOpcodeParent;
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(isa<Instruction>(VL[0]) && "Expected instruction");
      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
      constexpr unsigned IntrinsicNumOperands = 2;
      if (isa<IntrinsicInst>(VL[0]))
        NumOperands = IntrinsicNumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),

    unsigned getNumOperands() const { return OpsVec.size(); }

    unsigned getNumLanes() const { return OpsVec[0].size(); }

    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;

    bool empty() const { return OpsVec.empty(); }

    void clear() { OpsVec.clear(); }

    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
              ((Lns > 2 && isa<Constant>(Data.V)) ||
                 isa<Constant>(Data.V)))) ||
              (IsInvariant && !isa<Constant>(Data.V) &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
        if (!FoundCandidate)
      return getNumLanes() == 2 || Cnt > 1;

    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                      Op->getParent() == cast<Instruction>(OpILn)->getParent());
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
      appendOperandsOfVL(RootVL);

      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;

      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      unsigned FirstLane = getBestLaneToStartReordering();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        if (isa<LoadInst>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        else if (isa<Argument>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Failed;

      auto &&SkipReordering = [this]() {
        for (const OperandData &Data : Op0)
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
      if (SkipReordering())

      bool StrategyFailed = false;
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            std::optional<unsigned> BestIdx = getBestOperand(
                OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
              swap(OpIdx, *BestIdx, Lane);
              StrategyFailed = true;
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
              if (OpS.getOpcode() && OpS.isAltShuffle())
      if (!StrategyFailed)

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode:
      case ReorderingMode::Constant:
      case ReorderingMode::Splat:
      case ReorderingMode::Failed:

      const unsigned Indent = 2;
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          if (Value *V = OpData.V)
          OS << ", APO:" << OpData.APO << "}\n";

      int BestScore = Limit;
      std::optional<int> Index;
      for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
        if (Score > BestScore) {
    DeletedInstructions.insert(I);

  template <typename T>
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      auto *I = cast<Instruction>(V);
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
                     cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
      VI->removeFromParent();
      DeletedInstructions.insert(VI);

    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return NonScheduledFirst.contains(V);
  bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
                             unsigned &MaxDepthLevel,
                             bool &IsProfitableToDemote,
                             bool IsTruncRoot) const;

  canReorderOperands(TreeEntry *UserTE,

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  getCastContextHint(const TreeEntry &TE) const;

                       const EdgeInfo &EI);

                        bool ResizeAllowed = false) const;

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
                            unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(
                        unsigned NumParts, bool ForOrder = false);

                           Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
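  /// A node of the vectorizable tree: it either holds a bundle of scalars to
  /// be turned into one vector instruction (Vectorize, ScatterVectorize,
  /// StridedVectorize, CombinedVectorize) or a set of scalars that must be
  /// gathered element by element (NeedToGather).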
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

                    [Scalars](Value *V, int Idx) {
                      return (isa<UndefValue>(V) &&
                              Idx == PoisonMaskElem) ||
                             (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        return IsSame(Scalars, Mask);
      if (VL.size() == ReuseShuffleIndices.size()) {
        return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

    bool isGather() const { return State == NeedToGather; }

    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;

    VecTreeTy &Container;

      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");

    void setOperandsInOrder() {
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");

    bool isAltShuffle() const { return MainOp != AltOp; }

      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);

      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))

    void setOperations(const InstructionsState &S) {

    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;

    unsigned getAltOpcode() const {

    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));

    bool isNonPowOf2Vec() const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
        dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      dbgs() << "MainOp: ";
        dbgs() << *MainOp << "\n";
      dbgs() << "AltOp: ";
        dbgs() << *AltOp << "\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";

  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (!Last->isGather()) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
               "Scalar already in tree!");
          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
        ScalarToTreeEntry[V] = Last;
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
             "Bundle and VL out of sync");
      for (Value *V : VL) {
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      assert(!BundleMember && "Bundle and VL out of sync");
      bool AllConstsOrCasts = true;
        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    if (UserTreeIdx.UserTE) {
      Last->UserTreeIndices.push_back(UserTreeIdx);
      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
             "Reordering isn't implemented for non-power-of-2 nodes yet");

  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  bool areAltOperandsProfitable(const InstructionsState &S,

  TreeEntry::EntryState getScalarsVectorizationState(

  using ValueToGatherNodesMap =
  ValueToGatherNodesMap ValueToGatherNodes;

  struct ExternalUser {

    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
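  /// Contains all scheduling-relevant data for an instruction. A bundle of
  /// ScheduleData, linked through NextInBundle, represents one vectorization
  /// candidate that must be scheduled as a unit.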
  struct ScheduleData {
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();

    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;

      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;

    TreeEntry *TE = nullptr;

    ScheduleData *FirstInBundle = nullptr;

    ScheduleData *NextInBundle = nullptr;

    ScheduleData *NextLoadStore = nullptr;

    int SchedulingRegionID = 0;

    int SchedulingPriority = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

    bool IsScheduled = false;

                                       const BoUpSLP::ScheduleData &SD) {
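  /// Contains all scheduling data for a basic block. The scheduling region is
  /// grown lazily around the bundles being vectorized, and def-use, memory and
  /// control dependencies are tracked per ScheduleData.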
  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;
      ++SchedulingRegionID;

      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;

    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (def): " << *DepBundle << "\n");

        if (TreeEntry *TE = BundleMember->TE) {
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");
          auto *In = BundleMember->Inst;
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))

        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");

        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");

    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
          ReadyList.insert(SD);
                     << "SLP: initially in ready list: " << *SD << "\n");

    std::optional<ScheduleData *>
                      const InstructionsState &S);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

                             ScheduleData *PrevLoadStore,
                             ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,

    void resetSchedule();

    ScheduleData *FirstLoadStoreInRegion = nullptr;

    ScheduleData *LastLoadStoreInRegion = nullptr;

    bool RegionHasStackSave = false;

    int ScheduleRegionSize = 0;

    int SchedulingRegionID = 1;

  void scheduleBlock(BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  unsigned ReductionBitWidth = 0;

  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  struct ChildIteratorType
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

    return R.VectorizableTree[0].get();

    return {N->UserTreeIndices.begin(), N->Container};

    return {N->UserTreeIndices.end(), N->Container};

  class nodes_iterator {
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

    return nodes_iterator(R->VectorizableTree.begin());

    return nodes_iterator(R->VectorizableTree.end());

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }

    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;

    if (Entry->isGather())
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";

  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      if (isa<PHINode>(I))
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
        I->insertBefore(F->getEntryBlock().getTerminator());
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
    I->dropAllReferences();
  for (auto *I : DeletedInstructions) {
           "trying to erase instruction with users.");
    I->eraseFromParent();
#ifdef EXPENSIVE_CHECKS

  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Reuses[Mask[I]] = Prev[I];

                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (Order.empty()) {
    std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    PrevOrder.swap(Order);
  for (unsigned I = 0; I < Sz; ++I)
      Order[I] = PrevOrder[Mask[I]];
        return Data.value() == Sz || Data.index() == Data.value();
  if (Order.empty()) {
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  for (unsigned I = 0; I < Sz; ++I)
      Order[MaskOrder[I]] = I;
std::optional<BoUpSLP::OrdersType>
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
    return std::nullopt;
  if (NumParts == 0 || NumParts >= NumScalars)
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      Entries.front().front()->isSame(TE.Scalars)) {
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
    return all_of(Mask, [&](int I) {
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
      const int VF = GetVF(I);
        if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
          std::fill(Slice.begin(), Slice.end(), NumScalars);
          ShuffledSubMasks.set(I);
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
          Value *V = GatheredScalars[I * PartSz + K];
            SecondVecFound = true;
          SecondVecFound = true;
      FirstMin = (FirstMin / PartSz) * PartSz;
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx >= PartSz) {
          SecondVecFound = true;
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  .getKnownMinValue());
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
                                 bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)

template <typename T>
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;

  unsigned Sz = Order.size();
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
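/// Checks whether the pointers in the list advance by a common (possibly
/// runtime) stride, using SCEV to factor the distances between them; when a
/// stride is found, the sorted order of the accesses is also produced.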
static std::optional<Value *>
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
      return std::nullopt;
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      if (isa<SCEVCouldNotCompute>(Diff))
        return std::nullopt;
        PtrSCEVLowest = PtrSCEV;
      if (isa<SCEVCouldNotCompute>(Diff1))
        return std::nullopt;
        PtrSCEVHighest = PtrSCEV;
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
    if (Multiplier == Dist)
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    Stride = TryGetStride(Dist, Sz);
      return std::nullopt;
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  using DistOrdPair = std::pair<int64_t, int>;
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Coeff = TryGetStride(Diff, Stride);
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
      return std::nullopt;
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
static std::pair<InstructionCost, InstructionCost>

  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
          Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))

  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
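// Classifies how a bundle of loads can be vectorized: as one wide consecutive
// load, a strided load, a masked gather, or not at all, based on the pointer
// operands and the relative cost of the alternatives.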
4669 const unsigned Sz = VL.
size();
4671 auto *POIter = PointerOps.
begin();
4672 for (
Value *V : VL) {
4673 auto *L = cast<LoadInst>(V);
4676 *POIter = L->getPointerOperand();
4687 "supported with VectorizeNonPowerOf2");
4691 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4702 if (Order.
empty()) {
4703 Ptr0 = PointerOps.
front();
4704 PtrN = PointerOps.
back();
4706 Ptr0 = PointerOps[Order.
front()];
4707 PtrN = PointerOps[Order.
back()];
4709 std::optional<int> Diff =
4712 if (
static_cast<unsigned>(*Diff) == Sz - 1)
4715 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4729 auto IsAnyPointerUsedOutGraph =
4730 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
4731 return isa<Instruction>(V) &&
any_of(V->users(), [&](
User *U) {
4732 return !getTreeEntry(U) && !MustGather.contains(U);
4735 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4737 (
static_cast<unsigned>(std::abs(*Diff)) <=
4740 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4741 *Diff == -(
static_cast<int>(Sz) - 1))) {
4742 int Stride = *Diff /
static_cast<int>(Sz - 1);
4743 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
4755 else if (
Ptr != Ptr0)
4760 if (((Dist / Stride) * Stride) != Dist ||
4761 !Dists.
insert(Dist).second)
4764 if (Dists.
size() == Sz)
4770 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment) {
4771 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
4773 unsigned MaxVF = std::max<unsigned>(
bit_floor(VL.
size() / 2), MinVF);
4774 MaxVF = std::min(
getMaximumVF(Sz, Instruction::Load), MaxVF);
4775 for (
unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4776 unsigned VectorizedCnt = 0;
4778 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End;
4779 Cnt += VF, ++VectorizedCnt) {
4797 if (VectorizedCnt == VL.
size() / VF) {
4800 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4801 TTI, PointerOps, PointerOps.
front(), Instruction::GetElementPtr,
4805 Instruction::Load, VecTy,
4807 false, CommonAlignment,
CostKind) +
4808 VectorGEPCost - ScalarGEPCost;
4812 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
4815 auto [ScalarGEPCost, VectorGEPCost] =
4817 LI0->getPointerOperand(), Instruction::Load,
4820 Instruction::Load, SubVecTy, LI0->getAlign(),
4821 LI0->getPointerAddressSpace(),
CostKind,
4823 VectorGEPCost - ScalarGEPCost;
4827 auto [ScalarGEPCost, VectorGEPCost] =
4829 LI0->getPointerOperand(), Instruction::Load,
4833 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4834 false, CommonAlignment,
CostKind) +
4835 VectorGEPCost - ScalarGEPCost;
4839 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4841 LI0->getPointerOperand(), Instruction::GetElementPtr,
4845 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4846 false, CommonAlignment,
CostKind) +
4847 VectorGEPCost - ScalarGEPCost;
4852 "Expected only consecutive, strided or masked gather loads.");
4855 for (
int Idx : seq<int>(0, VL.
size()))
4859 ShuffleMask,
CostKind,
I * VF, SubVecTy);
4864 if (MaskedGatherCost >= VecLdCost)
4874 bool ProfitableGatherPointers =
4877 return L->isLoopInvariant(V);
4879 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
4880 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
4882 (
GEP &&
GEP->getNumOperands() == 2 &&
4883 isa<Constant, Instruction>(
GEP->getOperand(1)));
4885 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4890 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
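// Groups pointer operands by their underlying base and sorts each group by
// constant offset, so runs of consecutive accesses can be recognized even when
// the original value list interleaves several bases.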
         "Expected list of pointer operands.");
  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
    std::optional<int> Diff =
      Base.second.emplace_back(Ptr, *Diff, Cnt++);
      if (Bases.size() > VL.size() / 2 - 1)
      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
  bool AnyConsecutive = false;
  for (auto &Base : Bases) {
    auto &Vec = Base.second;
    if (Vec.size() > 1) {
                  const std::tuple<Value *, int, unsigned> &Y) {
          return std::get<1>(X) < std::get<1>(Y);
      int InitialOffset = std::get<1>(Vec[0]);
          return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
  SortedIndices.clear();
  if (!AnyConsecutive)
  for (auto &Base : Bases) {