#ifdef EXPENSIVE_CHECKS
// (contents elided in this extract)
#endif

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");

// Command-line options. Only the cl::desc() strings survive in this extract;
// the option names, types and default values are elided:
//   "Run the SLP vectorization passes"
//   "Enable vectorization for wider vector utilization"
//   "Only vectorize if you gain more than this " (threshold; string truncated)
//   "When true, SLP vectorizer bypasses profitability checks based on
//    heuristics and makes vectorization decision via cost modeling."
//   "Attempt to vectorize horizontal reductions"
//   "Attempt to vectorize horizontal reductions feeding into a store"
//   "Allow optimization of original scalar identity operations on matched
//    horizontal reductions."
//   "Attempt to vectorize for this register size in bits"
//   "Maximum SLP vectorization factor (0=unlimited)"
//   "Limit the size of the SLP scheduling region per block"
//   "Attempt to vectorize for this register size in bits"
//   "Limit the recursion depth when building a vectorizable tree"
//   "Only vectorize small trees if they are fully vectorizable"
//   "The maximum look-ahead depth for operand reordering scores"
//   "The maximum look-ahead depth for searching best rooting option"
//   "The minimum number of loads, which should be considered strided, if the
//    stride is > 1 or is runtime value"
//   "The maximum stride, considered to be profitable."
//   "Display the SLP trees with Graphviz"
//   "Try to vectorize with non-power-of-2 number of elements."
/// Checks whether \p Ty is a valid scalar type for SLP vectorization. With
/// REVEC enabled (SLPReVec), fixed vectors are reduced to their element type
/// first.
static bool isValidElementType(Type *Ty) {
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns the number of elements of \p Ty, or 1 for scalar types.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}

/// With REVEC enabled, expand a scalar shuffle mask into a per-element mask:
/// every "scalar" lane is itself a vector of VecTyNumElements elements, so
/// lane I of the scalar mask becomes VecTyNumElements consecutive entries
/// Mask[I] * VecTyNumElements + J.
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements, PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}
/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type
/// or extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// \returns the number of elements in the given part of a vector split into
/// parts of \p PartNumElems elements each.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
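// Example: splitting Size = 7 scalars into parts of PartNumElems = 4,
// getNumElems(7, 4, /*Part=*/1) returns 3 for the (shorter) last part.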
#if !defined(NDEBUG)
/// Builds a short string describing a bundle of values for debug output,
/// e.g. "Idx: 3, n=4 [  %x = ..., ..]".
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *I0 = dyn_cast<Instruction>(VL.front());
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are the same value (modulo
/// undefs) and at least one non-undef value is present.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
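// Example: {%x, undef, %x, %x} is a splat of %x, while {undef, undef} is not
// because there is no non-undef value.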
/// \returns True if \p I is commutative; handles CmpInst and BinaryOperator,
/// including sub/fsub whose users cannot observe an operand swap.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0.
                  ICmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
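// Rationale: "sub %a, %b" is treated as commutative when every user is either
// "icmp eq/ne (sub %a, %b), 0" or an abs() of the subtraction -- swapping the
// operands only negates the result, which such users cannot observe (and
// likewise fsub when it only feeds fabs()).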
/// \returns the index of an insertelement/extractelement \p Inst, scaled by
/// the vector factor and offset by \p Offset, or std::nullopt if it cannot be
/// determined.
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns the insertion/extraction index of an insertelement,
/// extractelement or insertvalue instruction, or std::nullopt if it cannot be
/// determined.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  // Flatten the aggregate index path into a single linear position.
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
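// Example trace of the insertvalue path above: for indices {0, 1} into
// {[2 x float], float}, Index becomes ((0 * 2 + 0) * 2) + 1 = 1.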
  // Build a bit vector of the lanes of a (two-input) shuffle that are really
  // used, interpreting Mask according to MaskArg.
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
/// Checks whether \p V is an (effectively) undef/poison vector, restricted to
/// the lanes set in \p UseMask. With IsPoisonOnly only poison lanes are
/// accepted. \returns a bit vector of the lanes known to be undef/poison.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      // Walk an insertelement chain and drop the lanes that were overwritten
      // with non-undef values.
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx)
          continue;
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
/// Checks if the list of extractelement instructions in \p VL can be
/// represented as a single shuffle of at most two source vectors; fills
/// \p Mask accordingly and returns the shuffle kind, or std::nullopt.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  // (a small helper check over the extracted vector operands is elided in
  // this extract; it inspects each ExtractElementInst's vector operand and
  // whether it is undef)
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from an undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // At most two different vector operands are allowed.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index differs from the lane number, it is a permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we are not crossing lanes between the two vectors, this is a select.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // Otherwise it is a permutation of one or two sources.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
/// \returns the extraction index of an extractelement/extractvalue
/// instruction \p E, or std::nullopt if it cannot be determined.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
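// Example: getExtractIndex of "extractelement <4 x i32> %v, i32 2" returns 2;
// an extractvalue with more than one index returns std::nullopt.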
/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main and the alternate instruction of the bundle.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; }

  unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
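// Example: for the bundle {fadd, fsub, fadd, fsub} the state records MainOp =
// the first fadd and AltOp = the first fsub; isAltShuffle() is true and the
// bundle is later emitted as two vector ops blended by a shufflevector.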
/// \returns \p Op if it has the same (or alternate) opcode as described by
/// \p S, otherwise the state's OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);

/// Checks if the provided operands of two cmp instructions are compatible,
/// i.e. compatible instructions or constants.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if the compare \p CI looks like \p BaseCI, either as-is or
/// with its operands and predicate swapped.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
/// \returns analysis of the instructions in \p VL described in
/// InstructionsState: the common opcode and the (optional) alternate opcode.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction>))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  bool SwappedPredsCompatible = [&]() {
    if (!IsCmpOp)
      return false;
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // If there are more than two predicates overall but only two modulo
    // swapping, treat swapped predicates as compatible rather than alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();

  // Check for an alternate opcode (second BinaryOperator/CastInst opcode or a
  // compatible/alternate compare predicate).
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are
        // not compatible, alternate vectorization is required.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
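// Example: for VL = {add, sub, add, sub} this returns MainOp = the first add
// and AltOp = the first sub (an alternate-shuffle bundle); for values of
// different instruction kinds (say an add and a load) no common or alternate
// opcode exists and MainOp/AltOp are null.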
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
/// Fixes up the ordering indices: entries equal to the order size (unset) are
/// replaced by the indices that are not yet used.
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}
/// \returns a bitset for selecting opcodes: false for Opcode0 and true for
/// Opcode1.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
                                      unsigned Opcode1) {
  Type *ScalarTy = VL[0]->getType();
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size()))
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  return OpcodeMask;
}

/// Converts the order of a permutation into the corresponding shuffle mask.
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
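// Example: inversePermutation({2, 0, 1}, Mask) produces Mask = {1, 2, 0},
// since the scalar placed at position 2 comes from lane 0, so Mask[2] = 0,
// etc. Likewise, getAltInstrMask({add, sub, add, sub}, Add, Sub) sets the
// bits of lanes 1 and 3 (the lanes whose opcode is Opcode1).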
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
/// Checks if the provided value does not require scheduling: it is not an
/// instruction, or it does not read/write memory and none of its operands is
/// an instruction defined in the same basic block.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling: it is not an
/// instruction, or it does not read/write memory and all of its users are phi
/// nodes or live in a different basic block.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified array of instructions does not require scheduling:
/// either none of the operands or none of the users needs to be scheduled in
/// the current basic block.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}
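// Note: a bundle whose values all satisfy one of the two predicates above
// never needs a scheduling region: either no operand is defined in the same
// block (nothing must be ordered before it) or every user is a PHI or lives
// in another block (nothing in this block must be ordered after it).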
1136namespace slpvectorizer {
1141 struct ScheduleData;
1166 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1167 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1221 return !VectorizableTree.
empty() &&
1222 !VectorizableTree.
front()->UserTreeIndices.empty();
1227 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1228 return VectorizableTree.
front()->Scalars;
1234 return MinBWs.
at(VectorizableTree.
front().get()).second;
1249 VectorizableTree.
clear();
1250 ScalarToTreeEntry.clear();
1251 MultiNodeScalars.clear();
1253 NonScheduledFirst.
clear();
1254 EntryToLastInstruction.clear();
1255 ExternalUses.
clear();
1256 ExternalUsesAsGEPs.clear();
1257 for (
auto &Iter : BlocksSchedules) {
1258 BlockScheduling *BS = Iter.second.get();
1262 ReductionBitWidth = 0;
1263 CastMaxMinBWSizes.reset();
1264 ExtraBitWidthNodes.
clear();
1265 InstrElementSize.clear();
1266 UserIgnoreList =
nullptr;
1267 PostponedGathers.
clear();
1268 ValueToGatherNodes.
clear();
1325 return MaxVecRegSize;
1330 return MinVecRegSize;
1338 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
1340 return MaxVF ? MaxVF : UINT_MAX;
1384 bool TryRecursiveCheck =
true)
const;
1408 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1409 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1431 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1432 MaxLevel(MaxLevel) {}
1486 if (isa<LoadInst>(V1)) {
1488 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1493 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1495 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1498 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1501 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1503 ((
int)V1->getNumUses() == NumLanes ||
1504 AllUsersAreInternal(V1, V2)))
1510 auto CheckSameEntryOrFail = [&]() {
1511 if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1512 TE1 && TE1 == R.getTreeEntry(V2))
1517 auto *LI1 = dyn_cast<LoadInst>(V1);
1518 auto *LI2 = dyn_cast<LoadInst>(V2);
1520 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1522 return CheckSameEntryOrFail();
1525 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1526 LI2->getPointerOperand(),
DL, SE,
true);
1527 if (!Dist || *Dist == 0) {
1530 R.TTI->isLegalMaskedGather(
1533 return CheckSameEntryOrFail();
1537 if (std::abs(*Dist) > NumLanes / 2)
1546 auto *C1 = dyn_cast<Constant>(V1);
1547 auto *C2 = dyn_cast<Constant>(V2);
1561 if (isa<UndefValue>(V2))
1565 Value *EV2 =
nullptr;
1578 int Dist = Idx2 - Idx1;
1581 if (std::abs(Dist) == 0)
1583 if (std::abs(Dist) > NumLanes / 2)
1590 return CheckSameEntryOrFail();
1593 auto *I1 = dyn_cast<Instruction>(V1);
1594 auto *I2 = dyn_cast<Instruction>(V2);
1596 if (I1->getParent() != I2->getParent())
1597 return CheckSameEntryOrFail();
1604 if (S.getOpcode() &&
1605 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1606 !S.isAltShuffle()) &&
1608 return cast<Instruction>(V)->getNumOperands() ==
1609 S.MainOp->getNumOperands();
1615 if (isa<UndefValue>(V2))
1618 return CheckSameEntryOrFail();
1652 int ShallowScoreAtThisLevel =
1661 auto *I1 = dyn_cast<Instruction>(
LHS);
1662 auto *I2 = dyn_cast<Instruction>(
RHS);
1663 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1665 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1666 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1667 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1668 ShallowScoreAtThisLevel))
1669 return ShallowScoreAtThisLevel;
1670 assert(I1 && I2 &&
"Should have early exited.");
1677 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1678 OpIdx1 != NumOperands1; ++OpIdx1) {
1680 int MaxTmpScore = 0;
1681 unsigned MaxOpIdx2 = 0;
1682 bool FoundBest =
false;
1686 ? I2->getNumOperands()
1687 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1688 assert(FromIdx <= ToIdx &&
"Bad index");
1689 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1691 if (Op2Used.
count(OpIdx2))
1696 I1, I2, CurrLevel + 1, std::nullopt);
1699 TmpScore > MaxTmpScore) {
1700 MaxTmpScore = TmpScore;
1707 Op2Used.
insert(MaxOpIdx2);
1708 ShallowScoreAtThisLevel += MaxTmpScore;
1711 return ShallowScoreAtThisLevel;
1742 struct OperandData {
1743 OperandData() =
default;
1744 OperandData(
Value *V,
bool APO,
bool IsUsed)
1745 : V(V), APO(APO), IsUsed(IsUsed) {}
1755 bool IsUsed =
false;
1764 enum class ReorderingMode {
1781 const Loop *L =
nullptr;
1784 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
1785 return OpsVec[OpIdx][Lane];
1789 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
1790 return OpsVec[OpIdx][Lane];
1795 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
1796 OpIdx != NumOperands; ++OpIdx)
1797 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1799 OpsVec[OpIdx][Lane].IsUsed =
false;
1803 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
1804 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1816 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1817 Value *IdxLaneV = getData(
Idx, Lane).V;
1818 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1821 for (
unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1824 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1825 if (!isa<Instruction>(OpIdxLnV))
1827 Uniques.
insert(OpIdxLnV);
1829 int UniquesCount = Uniques.
size();
1830 int UniquesCntWithIdxLaneV =
1831 Uniques.
contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1832 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1833 int UniquesCntWithOpIdxLaneV =
1834 Uniques.
contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1835 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1838 UniquesCntWithOpIdxLaneV) -
1839 (
PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1848 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1849 Value *IdxLaneV = getData(
Idx, Lane).V;
1850 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1859 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1860 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1862 return R.areAllUsersVectorized(IdxLaneI)
1870 static const int ScoreScaleFactor = 10;
1878 int Lane,
unsigned OpIdx,
unsigned Idx,
1888 int SplatScore = getSplatScore(Lane, OpIdx,
Idx);
1889 if (Score <= -SplatScore) {
1894 Score += SplatScore;
1900 Score *= ScoreScaleFactor;
1901 Score += getExternalUseScore(Lane, OpIdx,
Idx);
1919 std::optional<unsigned>
1920 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
1923 unsigned NumOperands = getNumOperands();
1926 Value *OpLastLane = getData(OpIdx, LastLane).V;
1929 ReorderingMode RMode = ReorderingModes[OpIdx];
1930 if (RMode == ReorderingMode::Failed)
1931 return std::nullopt;
1934 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1940 std::optional<unsigned>
Idx;
1944 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
1950 bool IsUsed = RMode == ReorderingMode::Splat ||
1951 RMode == ReorderingMode::Constant ||
1952 RMode == ReorderingMode::Load;
1954 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
1956 OperandData &OpData = getData(
Idx, Lane);
1958 bool OpAPO = OpData.APO;
1967 if (OpAPO != OpIdxAPO)
1972 case ReorderingMode::Load:
1973 case ReorderingMode::Opcode: {
1974 bool LeftToRight = Lane > LastLane;
1975 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
1976 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
1977 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1978 OpIdx,
Idx, IsUsed);
1979 if (Score >
static_cast<int>(BestOp.Score) ||
1980 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
1983 BestOp.Score = Score;
1984 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1988 case ReorderingMode::Constant:
1989 if (isa<Constant>(
Op) ||
1990 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
1992 if (isa<Constant>(
Op)) {
1994 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1997 if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
2001 case ReorderingMode::Splat:
2002 if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
2003 IsUsed =
Op == OpLastLane;
2004 if (
Op == OpLastLane) {
2006 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2012 case ReorderingMode::Failed:
2018 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2022 return std::nullopt;
2029 unsigned getBestLaneToStartReordering()
const {
2030 unsigned Min = UINT_MAX;
2031 unsigned SameOpNumber = 0;
2042 for (
int I = getNumLanes();
I > 0; --
I) {
2043 unsigned Lane =
I - 1;
2044 OperandsOrderData NumFreeOpsHash =
2045 getMaxNumOperandsThatCanBeReordered(Lane);
2048 if (NumFreeOpsHash.NumOfAPOs < Min) {
2049 Min = NumFreeOpsHash.NumOfAPOs;
2050 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2052 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2053 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
2054 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2057 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2058 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2059 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
2060 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2061 auto *It = HashMap.
find(NumFreeOpsHash.Hash);
2062 if (It == HashMap.
end())
2063 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2069 unsigned BestLane = 0;
2070 unsigned CntMin = UINT_MAX;
2072 if (
Data.second.first < CntMin) {
2073 CntMin =
Data.second.first;
2074 BestLane =
Data.second.second;
2081 struct OperandsOrderData {
2084 unsigned NumOfAPOs = UINT_MAX;
2087 unsigned NumOpsWithSameOpcodeParent = 0;
2101 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
2102 unsigned CntTrue = 0;
2103 unsigned NumOperands = getNumOperands();
2113 bool AllUndefs =
true;
2114 unsigned NumOpsWithSameOpcodeParent = 0;
2118 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2119 const OperandData &OpData = getData(OpIdx, Lane);
2124 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
2126 I->getParent() != Parent) {
2127 if (NumOpsWithSameOpcodeParent == 0) {
2128 NumOpsWithSameOpcodeParent = 1;
2130 Parent =
I->getParent();
2132 --NumOpsWithSameOpcodeParent;
2135 ++NumOpsWithSameOpcodeParent;
2139 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2140 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2144 OperandsOrderData
Data;
2145 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2146 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2154 assert((empty() || VL.
size() == getNumLanes()) &&
2155 "Expected same number of lanes");
2156 assert(isa<Instruction>(VL[0]) &&
"Expected instruction");
2157 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2158 constexpr unsigned IntrinsicNumOperands = 2;
2159 if (isa<IntrinsicInst>(VL[0]))
2160 NumOperands = IntrinsicNumOperands;
2161 OpsVec.
resize(NumOperands);
2162 unsigned NumLanes = VL.
size();
2163 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2164 OpsVec[OpIdx].
resize(NumLanes);
2165 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2166 assert(isa<Instruction>(VL[Lane]) &&
"Expected instruction");
2177 bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
2178 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2179 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2186 unsigned getNumOperands()
const {
return OpsVec.
size(); }
2189 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
2192 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
2193 return getData(OpIdx, Lane).V;
2197 bool empty()
const {
return OpsVec.
empty(); }
2200 void clear() { OpsVec.
clear(); }
2205 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2206 bool OpAPO = getData(OpIdx, Lane).APO;
2207 bool IsInvariant = L && L->isLoopInvariant(
Op);
2209 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2213 bool FoundCandidate =
false;
2214 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2215 OperandData &
Data = getData(OpI, Ln);
2216 if (
Data.APO != OpAPO ||
Data.IsUsed)
2218 Value *OpILane = getValue(OpI, Lane);
2219 bool IsConstantOp = isa<Constant>(OpILane);
2228 ((Lns > 2 && isa<Constant>(
Data.V)) ||
2234 isa<Constant>(
Data.V)))) ||
2241 (IsInvariant && !isa<Constant>(
Data.V) &&
2243 L->isLoopInvariant(
Data.V))) {
2244 FoundCandidate =
true;
2251 if (!FoundCandidate)
2254 return getNumLanes() == 2 || Cnt > 1;
2259 bool canBeVectorized(
Instruction *
Op,
unsigned OpIdx,
unsigned Lane)
const {
2260 bool OpAPO = getData(OpIdx, Lane).APO;
2261 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2264 if (
any_of(seq<unsigned>(getNumOperands()), [&](
unsigned OpI) {
2265 const OperandData &
Data = getData(OpI, Ln);
2266 if (
Data.APO != OpAPO ||
Data.IsUsed)
2268 Value *OpILn = getValue(OpI, Ln);
2269 return (L && L->isLoopInvariant(OpILn)) ||
2271 Op->getParent() == cast<Instruction>(OpILn)->getParent());
2281 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
2285 appendOperandsOfVL(RootVL);
2292 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2293 "Expected same num of lanes across all operands");
2294 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2295 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2303 unsigned NumOperands = getNumOperands();
2304 unsigned NumLanes = getNumLanes();
2324 unsigned FirstLane = getBestLaneToStartReordering();
2327 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2328 Value *OpLane0 = getValue(OpIdx, FirstLane);
2331 if (isa<LoadInst>(OpLane0))
2332 ReorderingModes[OpIdx] = ReorderingMode::Load;
2333 else if (
auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2335 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2336 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2337 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2339 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2340 }
else if (isa<Constant>(OpLane0))
2341 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2342 else if (isa<Argument>(OpLane0))
2344 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2347 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2354 auto &&SkipReordering = [
this]() {
2357 for (
const OperandData &
Data : Op0)
2360 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2379 if (SkipReordering())
2382 bool StrategyFailed =
false;
2390 for (
unsigned I = 0;
I < NumOperands; ++
I)
2391 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2393 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2396 int Lane = FirstLane +
Direction * Distance;
2397 if (Lane < 0 || Lane >= (
int)NumLanes)
2400 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2403 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2405 std::optional<unsigned> BestIdx = getBestOperand(
2406 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2413 swap(OpIdx, *BestIdx, Lane);
2416 StrategyFailed =
true;
2419 if (MainAltOps[OpIdx].
size() != 2) {
2420 OperandData &AltOp = getData(OpIdx, Lane);
2421 InstructionsState OpS =
2423 if (OpS.getOpcode() && OpS.isAltShuffle())
2430 if (!StrategyFailed)
2435#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2438 case ReorderingMode::Load:
2440 case ReorderingMode::Opcode:
2442 case ReorderingMode::Constant:
2444 case ReorderingMode::Splat:
2446 case ReorderingMode::Failed:
2467 const unsigned Indent = 2;
2470 OS <<
"Operand " << Cnt++ <<
"\n";
2471 for (
const OperandData &OpData : OpDataVec) {
2473 if (
Value *V = OpData.V)
2477 OS <<
", APO:" << OpData.APO <<
"}\n";
2499 int BestScore = Limit;
2500 std::optional<int>
Index;
2501 for (
int I : seq<int>(0, Candidates.size())) {
2503 Candidates[
I].second,
2506 if (Score > BestScore) {
2521 DeletedInstructions.insert(
I);
2526 template <
typename T>
2529 for (
T *V : DeadVals) {
2530 auto *
I = cast<Instruction>(V);
2531 DeletedInstructions.insert(
I);
2534 for (
T *V : DeadVals) {
2535 if (!V || !Processed.
insert(V).second)
2537 auto *
I = cast<Instruction>(V);
2540 if (
const TreeEntry *Entry = getTreeEntry(
I)) {
2541 Entries.push_back(Entry);
2542 auto It = MultiNodeScalars.find(
I);
2543 if (It != MultiNodeScalars.end())
2544 Entries.append(It->second.begin(), It->second.end());
2546 for (
Use &U :
I->operands()) {
2547 if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2548 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2550 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
2551 return Entry->VectorizedValue == OpI;
2555 I->dropAllReferences();
2557 for (
T *V : DeadVals) {
2558 auto *
I = cast<Instruction>(V);
2559 if (!
I->getParent())
2564 cast<Instruction>(U.getUser()));
2566 "trying to erase instruction with users.");
2567 I->removeFromParent();
2571 while (!DeadInsts.
empty()) {
2574 if (!VI || !VI->getParent())
2577 "Live instruction found in dead worklist!");
2578 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
2585 for (
Use &OpU : VI->operands()) {
2586 Value *OpV = OpU.get();
2597 if (
auto *OpI = dyn_cast<Instruction>(OpV))
2598 if (!DeletedInstructions.contains(OpI) &&
2603 VI->removeFromParent();
2604 DeletedInstructions.insert(VI);
2612 return AnalyzedReductionsRoots.count(
I);
2617 AnalyzedReductionsRoots.insert(
I);
2631 AnalyzedReductionsRoots.clear();
2632 AnalyzedReductionVals.
clear();
2633 AnalyzedMinBWVals.
clear();
2645 return NonScheduledFirst.
contains(V);
2658 bool collectValuesToDemote(
const TreeEntry &E,
bool IsProfitableToDemoteRoot,
2662 unsigned &MaxDepthLevel,
2663 bool &IsProfitableToDemote,
2664 bool IsTruncRoot)
const;
2674 canReorderOperands(TreeEntry *UserTE,
2681 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2685 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2687 TreeEntry *TE =
nullptr;
2689 TE = getTreeEntry(V);
2690 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2692 auto It = MultiNodeScalars.find(V);
2693 if (It != MultiNodeScalars.end()) {
2694 for (TreeEntry *E : It->second) {
2695 if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2703 if (It != VL.
end()) {
2704 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2712 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2713 unsigned OpIdx)
const {
2714 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2715 const_cast<TreeEntry *
>(UserTE), OpIdx);
2719 bool areAllUsersVectorized(
2728 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
2732 getCastContextHint(
const TreeEntry &TE)
const;
2741 const EdgeInfo &EI);
2752 bool ResizeAllowed =
false)
const;
2763 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
2768 template <
typename BVTy,
typename ResTy,
typename...
Args>
2769 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
2774 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy);
2780 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
2787 std::optional<TargetTransformInfo::ShuffleKind>
2799 unsigned NumParts)
const;
2811 std::optional<TargetTransformInfo::ShuffleKind>
2812 isGatherShuffledSingleRegisterEntry(
2829 isGatherShuffledEntry(
2832 unsigned NumParts,
bool ForOrder =
false);
2839 Type *ScalarTy)
const;
2843 void setInsertPointAfterBundle(
const TreeEntry *E);
2851 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
2864 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
2880 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
2884 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2901 [Scalars](
Value *V,
int Idx) {
2902 return (isa<UndefValue>(V) &&
2903 Idx == PoisonMaskElem) ||
2904 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2907 if (!ReorderIndices.empty()) {
2914 return IsSame(Scalars, Mask);
2915 if (VL.
size() == ReuseShuffleIndices.size()) {
2917 return IsSame(Scalars, Mask);
2921 return IsSame(Scalars, ReuseShuffleIndices);
2924 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
2925 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2926 UserTreeIndices.front().UserTE == UserEI.UserTE;
2930 bool hasEqualOperands(
const TreeEntry &TE)
const {
2931 if (
TE.getNumOperands() != getNumOperands())
2934 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
2935 unsigned PrevCount =
Used.count();
2936 for (
unsigned K = 0;
K < E; ++
K) {
2939 if (getOperand(K) ==
TE.getOperand(
I)) {
2945 if (PrevCount ==
Used.count())
2954 unsigned getVectorFactor()
const {
2955 if (!ReuseShuffleIndices.empty())
2956 return ReuseShuffleIndices.size();
2957 return Scalars.
size();
2961 bool isGather()
const {
return State == NeedToGather; }
2988 enum CombinedOpcode {
2990 MinMax = Instruction::OtherOpsEnd + 1,
2992 CombinedOpcode CombinedOp = NotCombinedOp;
3006 VecTreeTy &Container;
3030 assert(Operands[OpIdx].empty() &&
"Already resized?");
3032 "Number of operands is greater than the number of scalars.");
3038 void setOperandsInOrder() {
3040 auto *I0 = cast<Instruction>(Scalars[0]);
3041 Operands.resize(I0->getNumOperands());
3042 unsigned NumLanes = Scalars.size();
3043 for (
unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3044 OpIdx != NumOperands; ++OpIdx) {
3046 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3047 auto *
I = cast<Instruction>(Scalars[Lane]);
3048 assert(
I->getNumOperands() == NumOperands &&
3049 "Expected same number of operands");
3050 Operands[OpIdx][Lane] =
I->getOperand(OpIdx);
3074 unsigned getNumOperands()
const {
return Operands.size(); }
3077 Value *getSingleOperand(
unsigned OpIdx)
const {
3079 assert(!Operands[OpIdx].empty() &&
"No operand available");
3084 bool isAltShuffle()
const {
return MainOp != AltOp; }
3087 unsigned CheckedOpcode =
I->getOpcode();
3088 return (getOpcode() == CheckedOpcode ||
3089 getAltOpcode() == CheckedOpcode);
3096 auto *
I = dyn_cast<Instruction>(
Op);
3097 if (
I && isOpcodeOrAlt(
I))
3102 void setOperations(
const InstructionsState &S) {
3116 unsigned getOpcode()
const {
3117 return MainOp ? MainOp->
getOpcode() : 0;
3120 unsigned getAltOpcode()
const {
3126 int findLaneForValue(
Value *V)
const {
3127 unsigned FoundLane = std::distance(Scalars.begin(),
find(Scalars, V));
3128 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3129 if (!ReorderIndices.
empty())
3130 FoundLane = ReorderIndices[FoundLane];
3131 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3132 if (!ReuseShuffleIndices.
empty()) {
3133 FoundLane = std::distance(ReuseShuffleIndices.
begin(),
3134 find(ReuseShuffleIndices, FoundLane));
3148 bool isNonPowOf2Vec()
const {
3150 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
3151 "Reshuffling not supported with non-power-of-2 vectors yet.");
3152 return IsNonPowerOf2;
3159 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
3160 dbgs() <<
"Operand " << OpI <<
":\n";
3161 for (
const Value *V : Operands[OpI])
3164 dbgs() <<
"Scalars: \n";
3165 for (
Value *V : Scalars)
3167 dbgs() <<
"State: ";
3170 dbgs() <<
"Vectorize\n";
3172 case ScatterVectorize:
3173 dbgs() <<
"ScatterVectorize\n";
3175 case StridedVectorize:
3176 dbgs() <<
"StridedVectorize\n";
3179 dbgs() <<
"NeedToGather\n";
3181 case CombinedVectorize:
3182 dbgs() <<
"CombinedVectorize\n";
3185 dbgs() <<
"MainOp: ";
3187 dbgs() << *MainOp <<
"\n";
3190 dbgs() <<
"AltOp: ";
3192 dbgs() << *AltOp <<
"\n";
3195 dbgs() <<
"VectorizedValue: ";
3196 if (VectorizedValue)
3197 dbgs() << *VectorizedValue <<
"\n";
3200 dbgs() <<
"ReuseShuffleIndices: ";
3201 if (ReuseShuffleIndices.
empty())
3204 for (
int ReuseIdx : ReuseShuffleIndices)
3205 dbgs() << ReuseIdx <<
", ";
3207 dbgs() <<
"ReorderIndices: ";
3208 for (
unsigned ReorderIdx : ReorderIndices)
3209 dbgs() << ReorderIdx <<
", ";
3211 dbgs() <<
"UserTreeIndices: ";
3212 for (
const auto &EInfo : UserTreeIndices)
3213 dbgs() << EInfo <<
", ";
3220 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3223 dbgs() <<
"SLP: " << Banner <<
":\n";
3225 dbgs() <<
"SLP: Costs:\n";
3226 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3227 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3228 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3229 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3230 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3236 std::optional<ScheduleData *> Bundle,
3237 const InstructionsState &S,
3238 const EdgeInfo &UserTreeIdx,
3241 TreeEntry::EntryState EntryState =
3242 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3243 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3244 ReuseShuffleIndices, ReorderIndices);
3248 TreeEntry::EntryState EntryState,
3249 std::optional<ScheduleData *> Bundle,
3250 const InstructionsState &S,
3251 const EdgeInfo &UserTreeIdx,
3254 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3255 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3256 "Need to vectorize gather entry?");
3257 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3258 TreeEntry *
Last = VectorizableTree.
back().get();
3259 Last->Idx = VectorizableTree.
size() - 1;
3260 Last->State = EntryState;
3261 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3262 ReuseShuffleIndices.end());
3263 if (ReorderIndices.
empty()) {
3265 Last->setOperations(S);
3268 Last->Scalars.assign(VL.
size(),
nullptr);
3271 if (Idx >= VL.size())
3272 return UndefValue::get(VL.front()->getType());
3276 Last->setOperations(S);
3277 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3279 if (!
Last->isGather()) {
3280 for (
Value *V : VL) {
3281 const TreeEntry *
TE = getTreeEntry(V);
3283 "Scalar already in tree!");
3286 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3289 ScalarToTreeEntry[
V] =
Last;
3292 ScheduleData *BundleMember = *Bundle;
3293 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3296 "Bundle and VL out of sync");
3298 for (
Value *V : VL) {
3303 BundleMember->TE =
Last;
3304 BundleMember = BundleMember->NextInBundle;
3307 assert(!BundleMember &&
"Bundle and VL out of sync");
3310 bool AllConstsOrCasts =
true;
3313 auto *
I = dyn_cast<CastInst>(V);
3314 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3317 if (AllConstsOrCasts)
3319 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3320 MustGather.
insert(VL.begin(), VL.end());
3323 if (UserTreeIdx.UserTE) {
3324 Last->UserTreeIndices.push_back(UserTreeIdx);
3325 assert((!
Last->isNonPowOf2Vec() ||
Last->ReorderIndices.empty()) &&
3326 "Reordering isn't implemented for non-power-of-2 nodes yet");
3333 TreeEntry::VecTreeTy VectorizableTree;
3338 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3339 VectorizableTree[
Id]->dump();
3345 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
3347 const TreeEntry *getTreeEntry(
Value *V)
const {
3348 return ScalarToTreeEntry.lookup(V);
3357 bool areAltOperandsProfitable(
const InstructionsState &S,
3362 TreeEntry::EntryState getScalarsVectorizationState(
3395 using ValueToGatherNodesMap =
3397 ValueToGatherNodesMap ValueToGatherNodes;
3400 struct ExternalUser {
3424 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3425 auto It = AliasCache.
find(Key);
3426 if (It != AliasCache.
end())
3431 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3435 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3467 UserList ExternalUses;
3487 struct ScheduleData {
3490 enum { InvalidDeps = -1 };
3492 ScheduleData() =
default;
3494 void init(
int BlockSchedulingRegionID,
Value *OpVal) {
3495 FirstInBundle =
this;
3496 NextInBundle =
nullptr;
3497 NextLoadStore =
nullptr;
3498 IsScheduled =
false;
3499 SchedulingRegionID = BlockSchedulingRegionID;
3500 clearDependencies();
3507 if (hasValidDependencies()) {
3508 assert(UnscheduledDeps <= Dependencies &&
"invariant");
3510 assert(UnscheduledDeps == Dependencies &&
"invariant");
3514 assert(isSchedulingEntity() &&
3515 "unexpected scheduled state");
3516 for (
const ScheduleData *BundleMember =
this; BundleMember;
3517 BundleMember = BundleMember->NextInBundle) {
3518 assert(BundleMember->hasValidDependencies() &&
3519 BundleMember->UnscheduledDeps == 0 &&
3520 "unexpected scheduled state");
3521 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3522 "only bundle is marked scheduled");
3526 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3527 "all bundle members must be in same basic block");
3533 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
3537 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
3541 bool isPartOfBundle()
const {
3542 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3547 bool isReady()
const {
3548 assert(isSchedulingEntity() &&
3549 "can't consider non-scheduling entity for ready list");
3550 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3556 int incrementUnscheduledDeps(
int Incr) {
3557 assert(hasValidDependencies() &&
3558 "increment of unscheduled deps would be meaningless");
3559 UnscheduledDeps += Incr;
3560 return FirstInBundle->unscheduledDepsInBundle();
3565 void resetUnscheduledDeps() {
3566 UnscheduledDeps = Dependencies;
3570 void clearDependencies() {
3571 Dependencies = InvalidDeps;
3572 resetUnscheduledDeps();
3573 MemoryDependencies.clear();
3574 ControlDependencies.clear();
3577 int unscheduledDepsInBundle()
const {
3578 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3580 for (
const ScheduleData *BundleMember =
this; BundleMember;
3581 BundleMember = BundleMember->NextInBundle) {
3582 if (BundleMember->UnscheduledDeps == InvalidDeps)
3584 Sum += BundleMember->UnscheduledDeps;
3590 if (!isSchedulingEntity()) {
3591 os <<
"/ " << *Inst;
3592 }
else if (NextInBundle) {
3594 ScheduleData *SD = NextInBundle;
3596 os <<
';' << *SD->Inst;
3597 SD = SD->NextInBundle;
3608 Value *OpValue =
nullptr;
3611 TreeEntry *
TE =
nullptr;
3615 ScheduleData *FirstInBundle =
nullptr;
3619 ScheduleData *NextInBundle =
nullptr;
3623 ScheduleData *NextLoadStore =
nullptr;
3637 int SchedulingRegionID = 0;
3640 int SchedulingPriority = 0;
3646 int Dependencies = InvalidDeps;
3652 int UnscheduledDeps = InvalidDeps;
3656 bool IsScheduled =
false;
3661 const BoUpSLP::ScheduleData &SD) {
3686 struct BlockScheduling {
3688 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
3692 ScheduleStart =
nullptr;
3693 ScheduleEnd =
nullptr;
3694 FirstLoadStoreInRegion =
nullptr;
3695 LastLoadStoreInRegion =
nullptr;
3696 RegionHasStackSave =
false;
3700 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3703 ScheduleRegionSize = 0;
3707 ++SchedulingRegionID;
3711 if (BB !=
I->getParent())
3714 ScheduleData *SD = ScheduleDataMap.lookup(
I);
3715 if (SD && isInSchedulingRegion(SD))
3720 ScheduleData *getScheduleData(
Value *V) {
3721 if (
auto *
I = dyn_cast<Instruction>(V))
3722 return getScheduleData(
I);
3726 ScheduleData *getScheduleData(
Value *V,
Value *Key) {
3728 return getScheduleData(V);
3729 auto I = ExtraScheduleDataMap.find(V);
3730 if (
I != ExtraScheduleDataMap.end()) {
3731 ScheduleData *SD =
I->second.lookup(Key);
3732 if (SD && isInSchedulingRegion(SD))
3738 bool isInSchedulingRegion(ScheduleData *SD)
const {
3739 return SD->SchedulingRegionID == SchedulingRegionID;
3744 template <
typename ReadyListType>
3745 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3746 SD->IsScheduled =
true;
3749 for (ScheduleData *BundleMember = SD; BundleMember;
3750 BundleMember = BundleMember->NextInBundle) {
3751 if (BundleMember->Inst != BundleMember->OpValue)
3757 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
3758 doForAllOpcodes(
I, [&ReadyList](ScheduleData *OpDef) {
3759 if (OpDef && OpDef->hasValidDependencies() &&
3760 OpDef->incrementUnscheduledDeps(-1) == 0) {
3764 ScheduleData *DepBundle = OpDef->FirstInBundle;
3765 assert(!DepBundle->IsScheduled &&
3766 "already scheduled bundle gets ready");
3767 ReadyList.insert(DepBundle);
3769 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
3777 if (TreeEntry *TE = BundleMember->TE) {
3779 int Lane = std::distance(
TE->Scalars.begin(),
3780 find(
TE->Scalars, BundleMember->Inst));
3781 assert(Lane >= 0 &&
"Lane not set");
3789 auto *
In = BundleMember->Inst;
3792 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3793 In->getNumOperands() ==
TE->getNumOperands()) &&
3794 "Missed TreeEntry operands?");
3797 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
3798 OpIdx != NumOperands; ++OpIdx)
3799 if (
auto *
I = dyn_cast<Instruction>(
TE->getOperand(OpIdx)[Lane]))
3804 for (
Use &U : BundleMember->Inst->operands())
3805 if (
auto *
I = dyn_cast<Instruction>(
U.get()))
3809 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3810 if (MemoryDepSD->hasValidDependencies() &&
3811 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3814 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3815 assert(!DepBundle->IsScheduled &&
3816 "already scheduled bundle gets ready");
3817 ReadyList.insert(DepBundle);
3819 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
3823 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3824 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3827 ScheduleData *DepBundle = DepSD->FirstInBundle;
3828 assert(!DepBundle->IsScheduled &&
3829 "already scheduled bundle gets ready");
3830 ReadyList.insert(DepBundle);
3832 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
3843 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3844 ScheduleStart->comesBefore(ScheduleEnd) &&
3845 "Not a valid scheduling region?");
3847 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3848 auto *SD = getScheduleData(
I);
3851 assert(isInSchedulingRegion(SD) &&
3852 "primary schedule data not in window?");
3853 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3854 "entire bundle in window!");
3856 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->verify(); });
3859 for (
auto *SD : ReadyInsts) {
3860 assert(SD->isSchedulingEntity() && SD->isReady() &&
3861 "item in ready list not ready?");
3866 void doForAllOpcodes(
Value *V,
3868 if (ScheduleData *SD = getScheduleData(V))
3870 auto I = ExtraScheduleDataMap.find(V);
3871 if (
I != ExtraScheduleDataMap.end())
3872 for (
auto &
P :
I->second)
3873 if (isInSchedulingRegion(
P.second))
3878 template <
typename ReadyListType>
3879 void initialFillReadyList(ReadyListType &ReadyList) {
3880 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3881 doForAllOpcodes(
I, [&](ScheduleData *SD) {
3882 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3884 ReadyList.insert(SD);
3886 <<
"SLP: initially in ready list: " << *SD <<
"\n");
3901 std::optional<ScheduleData *>
3903 const InstructionsState &S);
3909 ScheduleData *allocateScheduleDataChunks();
3913 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
3918 ScheduleData *PrevLoadStore,
3919 ScheduleData *NextLoadStore);
3923 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
3927 void resetSchedule();
3948 ExtraScheduleDataMap;
3961 ScheduleData *FirstLoadStoreInRegion =
nullptr;
3965 ScheduleData *LastLoadStoreInRegion =
nullptr;
3970 bool RegionHasStackSave =
false;
3973 int ScheduleRegionSize = 0;
3982 int SchedulingRegionID = 1;
3990 void scheduleBlock(BlockScheduling *BS);
3997 struct OrdersTypeDenseMapInfo {
4010 static unsigned getHashValue(
const OrdersType &V) {
4031 unsigned MaxVecRegSize;
4032 unsigned MinVecRegSize;
4047 unsigned ReductionBitWidth = 0;
4051 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4070 struct ChildIteratorType
4072 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4083 return R.VectorizableTree[0].get();
4087 return {
N->UserTreeIndices.begin(),
N->Container};
4091 return {
N->UserTreeIndices.end(),
N->Container};
4096 class nodes_iterator {
4107 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
4111 return nodes_iterator(R->VectorizableTree.begin());
4115 return nodes_iterator(R->VectorizableTree.end());
4118 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
4129 OS << Entry->Idx <<
".\n";
4132 for (
auto *V : Entry->Scalars) {
4134 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
4135 return EU.Scalar == V;
4145 if (Entry->isGather())
4147 if (Entry->State == TreeEntry::ScatterVectorize ||
4148 Entry->State == TreeEntry::StridedVectorize)
4149 return "color=blue";
4158 for (
auto *
I : DeletedInstructions) {
4159 if (!
I->getParent()) {
4162 if (isa<PHINode>(
I))
4164 I->insertBefore(
F->getEntryBlock(),
4165 F->getEntryBlock().getFirstNonPHIIt());
4167 I->insertBefore(
F->getEntryBlock().getTerminator());
4170 for (
Use &U :
I->operands()) {
4171 auto *
Op = dyn_cast<Instruction>(U.get());
4172 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
4176 I->dropAllReferences();
4178 for (
auto *
I : DeletedInstructions) {
4180 "trying to erase instruction with users.");
4181 I->eraseFromParent();
#ifdef EXPENSIVE_CHECKS
// (contents elided in this extract)
#endif

/// Reorders the list of reuses in accordance with the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}

/// Reorders the given \p Order in accordance with the given \p Mask. \p Order
/// is replaced by the new order, or cleared if the result is the identity.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  addMask(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
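// Example: reorderReuses with Reuses = {r0, r1, r2} and Mask = {2, 0, 1}
// yields {r1, r2, r0} -- the value previously at position I moves to position
// Mask[I].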
4255std::optional<BoUpSLP::OrdersType>
4257 assert(TE.isGather() &&
"Expected gather node only.");
4261 Type *ScalarTy = GatheredScalars.
front()->getType();
4262 int NumScalars = GatheredScalars.
size();
4264 return std::nullopt;
4267 if (NumParts == 0 || NumParts >= NumScalars)
4273 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4275 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4278 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4279 return std::nullopt;
4280 OrdersType CurrentOrder(NumScalars, NumScalars);
4281 if (GatherShuffles.
size() == 1 &&
4283 Entries.front().front()->isSame(TE.Scalars)) {
4286 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4287 return CurrentOrder;
4291 return all_of(Mask, [&](
int I) {
4298 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4299 (Entries.size() != 1 ||
4300 Entries.front().front()->ReorderIndices.empty())) ||
4301 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4302 return std::nullopt;
4307 for (
int I : seq<int>(0, NumParts)) {
4308 if (ShuffledSubMasks.
test(
I))
4310 const int VF = GetVF(
I);
4316 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
4317 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4318 ShuffledSubMasks.
set(
I);
4322 int FirstMin = INT_MAX;
4323 int SecondVecFound =
false;
4324 for (
int K : seq<int>(Limit)) {
4325 int Idx = Mask[
I * PartSz + K];
4327 Value *V = GatheredScalars[
I * PartSz + K];
4329 SecondVecFound =
true;
4338 SecondVecFound =
true;
4342 FirstMin = (FirstMin / PartSz) * PartSz;
4344 if (SecondVecFound) {
4345 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4346 ShuffledSubMasks.
set(
I);
4349 for (
int K : seq<int>(Limit)) {
4350 int Idx = Mask[
I * PartSz + K];
4354 if (
Idx >= PartSz) {
4355 SecondVecFound =
true;
4358 if (CurrentOrder[
I * PartSz +
Idx] >
4359 static_cast<unsigned>(
I * PartSz + K) &&
4360 CurrentOrder[
I * PartSz +
Idx] !=
4361 static_cast<unsigned>(
I * PartSz +
Idx))
4362 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4365 if (SecondVecFound) {
4366 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4367 ShuffledSubMasks.
set(
I);
4373 if (!ExtractShuffles.
empty())
4374 TransformMaskToOrder(
4375 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
4376 if (!ExtractShuffles[
I])
4379 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
4380 for (
unsigned Idx : seq<unsigned>(Sz)) {
4381 int K =
I * PartSz +
Idx;
4384 if (!TE.ReuseShuffleIndices.empty())
4385 K = TE.ReuseShuffleIndices[K];
4386 if (!TE.ReorderIndices.empty())
4387 K = std::distance(TE.ReorderIndices.begin(),
4388 find(TE.ReorderIndices, K));
4389 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4392 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4394 .getKnownMinValue());
4399 if (GatherShuffles.
size() == 1 && NumParts != 1) {
4400 if (ShuffledSubMasks.
any())
4401 return std::nullopt;
4402 PartSz = NumScalars;
4405 if (!Entries.empty())
4406 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
4407 if (!GatherShuffles[
I])
4409 return std::max(Entries[
I].front()->getVectorFactor(),
4410 Entries[
I].back()->getVectorFactor());
4413 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
4414 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4415 return std::nullopt;
4416 return std::move(CurrentOrder);
/// Checks whether two pointers can feed the same kind of vectorized memory
/// access: both are simple two-operand GEPs whose index operands are either
/// constants or share a common opcode (unless \p CompareOpcodes is false).
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  if (!GEP1)
    return false;
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  if (!GEP2)
    return false;
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
         ((isConstant(GEP1->getOperand(1)) &&
           isConstant(GEP2->getOperand(1))) ||
          !CompareOpcodes ||
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
              .getOpcode());
}

/// Calculates the minimal (common) alignment of a list of loads or stores.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  unsigned Sz = Order.size();
  return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
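// Example: with Sz = 4, Order = {3, 2, 1, 0} is a reverse order; entries equal
// to Sz (unset) are also accepted, so {3, 4, 1, 0} still counts as reversed.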
static std::optional<Value *>
// ...
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // ...
      return std::nullopt;
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      // ...
    }
    // ...
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    // ...
      PtrSCEVLowest = PtrSCEV;
    // ...
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    // ...
      PtrSCEVHighest = PtrSCEV;
  // ...
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      // ...
    }
    if (Multiplier == Dist)
      // ...
    // ...
  };
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    // ...
    Stride = TryGetStride(Dist, Sz);
    // ...
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // ...
  using DistOrdPair = std::pair<int64_t, int>;
  // ...
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  // ...
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    // ...
    if (PtrSCEV != PtrSCEVLowest) {
      // ...
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      // ...
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      // ...
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // ...
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    // ...
      return std::nullopt;
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    // ...
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // ...
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      // ...
    }
  }
  // ...
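// Rough reading of the run-time stride logic above (hedged; several lines are
// elided in this excerpt): the distance between the lowest and highest pointer
// SCEVs is peeled with TryGetStride to expose a common multiplier, every other
// pointer's offset from the lowest must then be a constant multiple of that
// stride, all offsets must be distinct, and SortedIndices is filled only when
// the offsets are valid but not already in consecutive order.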
static std::pair<InstructionCost, InstructionCost>
// ...
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    // ...
  const unsigned Sz = VL.size();
  // ...
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
    // ...
    *POIter = L->getPointerOperand();
    // ...
  }
  // ...
         "supported with VectorizeNonPowerOf2");
  // ...
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  // ...
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int> Diff =
      // ...
  if (static_cast<unsigned>(*Diff) == Sz - 1)
    // ...
  bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
  // ...
  auto IsAnyPointerUsedOutGraph =
      IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
        return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
          return !getTreeEntry(U) && !MustGather.contains(U);
        });
      });
  if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
                            (static_cast<unsigned>(std::abs(*Diff)) <=
                                 // ...
                             static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
                            *Diff == -(static_cast<int>(Sz) - 1))) {
    int Stride = *Diff / static_cast<int>(Sz - 1);
    if (*Diff == Stride * static_cast<int>(Sz - 1)) {
      // ...
      else if (Ptr != Ptr0)
        // ...
      if (((Dist / Stride) * Stride) != Dist ||
          !Dists.insert(Dist).second)
        // ...
      if (Dists.size() == Sz)
        // ...
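// Illustrative arithmetic for the strided-load check above (values are made
// up, not from this file): the stride is guessed from the first-to-last
// element distance and is kept only if it divides every individual distance
// and no distance repeats.
#if 0
int Diff = 6, Sz = 4;                      // e.g. loads at elements 0, 2, 4, 6
int Stride = Diff / (Sz - 1);              // == 2
bool Exact = (Diff == Stride * (Sz - 1));  // true, so the candidate survives
#endif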
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    // ...
    unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
    MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
    for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
      unsigned VectorizedCnt = 0;
      // ...
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
           Cnt += VF, ++VectorizedCnt) {
        // ...
      }
      if (VectorizedCnt == VL.size() / VF) {
        // ...
        auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
            TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
            // ...
            Instruction::Load, VecTy,
            // ...
            false, CommonAlignment, CostKind) +
            VectorGEPCost - ScalarGEPCost;
        // ...
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        // ...
        auto [ScalarGEPCost, VectorGEPCost] =
            // ...
            LI0->getPointerOperand(), Instruction::Load,
            // ...
            Instruction::Load, SubVecTy, LI0->getAlign(),
            LI0->getPointerAddressSpace(), CostKind,
            // ...
            VectorGEPCost - ScalarGEPCost;
        // ...
        auto [ScalarGEPCost, VectorGEPCost] =
            // ...
            LI0->getPointerOperand(), Instruction::Load,
            // ...
            Instruction::Load, SubVecTy, LI0->getPointerOperand(),
            false, CommonAlignment, CostKind) +
            VectorGEPCost - ScalarGEPCost;
        // ...
        auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
            // ...
            LI0->getPointerOperand(), Instruction::GetElementPtr,
            // ...
            Instruction::Load, SubVecTy, LI0->getPointerOperand(),
            false, CommonAlignment, CostKind) +
            VectorGEPCost - ScalarGEPCost;
        // ...
               "Expected only consecutive, strided or masked gather loads.");
        // ...
        for (int Idx : seq<int>(0, VL.size()))
          // ...
        // ...
        if (MaskedGatherCost >= VecLdCost)
          // ...
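// Cost-model intent of CheckForShuffledLoads, as far as the excerpt shows:
// the bundle is split into chunks of VF loads, each chunk is costed as a
// consecutive, strided or gathered access plus the GEP cost delta, and the
// summed VecLdCost is compared against MaskedGatherCost for a single gather
// of the whole bundle; the branch on MaskedGatherCost >= VecLdCost (body
// elided) presumably reports that the split is the cheaper option.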
  // ...
  bool ProfitableGatherPointers =
      // ...
      return L->isLoopInvariant(V);
      // ...
  if (ProfitableGatherPointers ||
      all_of(PointerOps, [IsSorted](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        // ...
            (GEP && GEP->getNumOperands() == 2 &&
             isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // ...
    Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
    // ...
    if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
      // ...
4837 "Expected list of pointer operands.");
4842 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4847 std::optional<int> Diff =
4853 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4859 if (Bases.
size() > VL.
size() / 2 - 1)
4863 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4869 bool AnyConsecutive =
false;
4870 for (
auto &
Base : Bases) {
4871 auto &Vec =
Base.second;
4872 if (Vec.size() > 1) {
4874 const std::tuple<Value *, int, unsigned> &
Y) {
4875 return std::get<1>(
X) < std::get<1>(
Y);
4877 int InitialOffset = std::get<1>(Vec[0]);
4879 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4885 SortedIndices.
clear();
4886 if (!AnyConsecutive)
4894 for (
auto &
Base : Bases) {
4895 Value *Strip =
Base.first->stripInBoundsConstantOffsets();
4896 Value *Root = Strip;
4897 while (
auto *Gep = dyn_cast<GetElementPtrInst>(Root))
4898 Root = Gep->getOperand(0);
4901 auto *Begin = SortedBases.
begin();
4902 auto *
End = SortedBases.
end();
4903 while (Begin !=
End) {
4904 Value *Root = std::get<2>(*Begin);
4905 auto *Mid = std::stable_partition(
4906 Begin,
End, [&Root](
auto V) {
return std::get<2>(V) == Root; });
4908 for (
auto I = Begin;
I < Mid; ++
I)
4909 LessThan.try_emplace(std::get<1>(*
I));
4910 for (
auto I = Begin;
I < Mid; ++
I) {
4911 Value *V = std::get<1>(*
I);
4912 while (
auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
4913 V = Gep->getOperand(0);
4914 if (LessThan.contains(V))
4915 LessThan[V][std::get<1>(*
I)] =
true;
4918 std::stable_sort(Begin, Mid, [&LessThan](
auto &V1,
auto &V2) {
4919 return LessThan[std::get<1>(V1)][std::get<1>(V2)];
4925 for (
auto Base : SortedBases)
4926 for (
auto &
T : Bases[std::get<0>(
Base)])
4930 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
// ...
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();
  // ...
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    // ...
  }
  // ...
    return std::move(Order);
  return std::nullopt;
}
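// Behavior visible in the gather-node helper above: it refuses to reorder
// unless every scalar in the node is a simple (non-volatile, non-atomic)
// LoadInst; otherwise it collects the pointer operands and, when an order can
// be recovered, returns it, falling back to std::nullopt.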
  // ...
  if (VU->getType() != V->getType())
    // ...
  if (!VU->hasOneUse() && !V->hasOneUse())
    // ...
  // ...
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    // ...
  // ...
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());