#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> ShouldVectorizeHor(
    "slp-vectorize-hor", cl::init(true), cl::Hidden,
    cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> AllowHorRdxIdenityOptimization(
    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

static cl::opt<int> MaxVectorRegSizeOption(
    "slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int> MaxStoreLookup(
    "slp-max-store-lookup", cl::init(32), cl::Hidden,
    cl::desc("Maximum depth of the lookup for consecutive stores."));

static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));
/// Predicate for the element types that the SLP vectorizer supports.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type
/// or extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
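// Example (illustrative): {%x, undef, %x} is a splat of %x; {undef, undef}
// is not a splat because FirstNonUndef stays null; {%x, %y} fails the
// V != FirstNonUndef check on the second non-undef value.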
/// \returns True if \p I is commutative; handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();
  // TODO: This should check for generic Instruction::isCommutative(), but
  // currently only the above cases are handled.
  return false;
}
/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as the base offset for the index.
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
                                              unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
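// Worked example (illustrative): for
//   %r = insertvalue {[2 x i32], i32} %agg, i32 %v, 0, 1
// the struct step computes Index = 0 * 2 + 0 = 0 and descends into the
// [2 x i32] member; the array step computes Index = 0 * 2 + 1 = 1, so the
// flat insertion index is 1.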
/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getInsertIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
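// Illustrative note (not part of the original source): the result bitvector
// has one bit per lane, set when that lane is known undef/poison. For
//   %b = insertelement <4 x i32> poison, i32 %x, i32 1
// the walk over the insertelement chain clears the bit of any used lane that
// receives a real value (lane 1 here), while the untouched poison lanes keep
// their bits set.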
/// Checks if the vector of instructions can be represented as a shuffle of 1
/// or 2 input vectors of extractelements with constant indices.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It =
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  if (It == VL.end())
    return std::nullopt;
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
    return std::nullopt;
  unsigned Size =
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from an undef or poison vector.
    if (isUndefVector(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
      return std::nullopt;
    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return std::nullopt;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    Mask[I] = IntIdx;
    // For correct shuffling we have to have at most 2 different vector
    // operands in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never filled, we consider it a single-source shuffle.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
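// Example (illustrative): extracting lanes {0,1} of %a and lanes {2,3} of %b
// (both <4 x ...>) leaves every mask index equal to its position, so the
// mode stays Select and the result is SK_Select; extracting {1,0,3,2} from a
// single vector switches the mode to Permute and yields a single-source
// permute kind.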
/// \returns the index extracted by an Extract{Element,Value} instruction, if
/// it maps to a single flat lane.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
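// Example (illustrative): extractvalue {i32, i32} %agg, 1 yields index 1; a
// multi-index extractvalue (e.g. ..., 0, 1) returns std::nullopt because it
// does not correspond to a single flat lane.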
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers them into a bunch, which is highly likely to
/// be detected as a shuffle of 1 or 2 input vectors. If the attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
static std::optional<TTI::ShuffleKind>
tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                           SmallVectorImpl<int> &Mask) {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in extracts.
  MapVector<unsigned, SmallVector<Value *>> VFToVector;
  for (const auto &Data : VectorOpToIdx)
    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
        .push_back(Data.first);
  for (auto &Data : VFToVector)
    stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
      return VectorOpToIdx.find(V1)->second.size() >
             VectorOpToIdx.find(V2)->second.size();
    });
  // Find the best pair of the vectors with the same number of elements or a
  // single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  Value *SingleVec = nullptr;
  unsigned PairMax = 0;
  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
  for (auto &Data : VFToVector) {
    Value *V1 = Data.second.front();
    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
      SingleVec = V1;
    }
    Value *V2 = nullptr;
    if (Data.second.size() > 1)
      V2 = *std::next(Data.second.begin());
    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
                            UndefSz) {
      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
      PairVec = std::make_pair(V1, V2);
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : VectorOpToIdx[SingleVec])
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else {
    for (Value *V : {PairVec.first, PairVec.second})
      for (int Idx : VectorOpToIdx[V])
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res) {
    // Restore the original VL if the attempt was not successful.
    VL.swap(SavedVL);
    return std::nullopt;
  }
  // Restore unused scalars from the mask if some of the extractelements were
  // not selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
      std::swap(VL[I], GatheredExtracts[I]);
  }
  return Res;
}
namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main and alternate instructions.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; }

  unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace
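// Illustrative note (not part of the original source): MainOp/AltOp encode
// the "alternate opcode" scheme. For the bundle {add, sub, add, sub} the
// state is MainOp = add, AltOp = sub and isAltShuffle() is true; codegen
// later emits one vector add, one vector sub, and a shufflevector blending
// the lanes of the two results.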
/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
/// \p OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);
/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, etc.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if a compare instruction \p CI has a similar "look" and the
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState: the opcode that we suppose the whole list could be
/// vectorized with, even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are
        // not compatible, we need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if (E == 2 &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
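// Example (illustrative IR): for the scalar bundle
//   %a0 = fadd float %x0, %y0
//   %s1 = fsub float %x1, %y1
//   %a2 = fadd float %x2, %y2
//   %s3 = fsub float %x3, %y3
// getSameOpcode returns MainOp = fadd, AltOp = fsub, which the vectorizer
// turns into a <4 x float> fadd, a <4 x float> fsub, and a shufflevector
// picking even lanes from the fadd and odd lanes from the fsub.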
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;
  return true;
}
/// \returns True if an in-tree use also needs to be extracted. This refers to
/// a possible scalar operand in a vectorized instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
        return (CI->getArgOperand(i) == Scalar);
    }
    [[fallthrough]];
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert((!ExtendingManyInputs || SubMask.size() > Mask.size()) &&
         "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
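// Worked example (illustrative): with Mask = {1,0,3,2} and SubMask =
// {2,3,0,1} the loop computes NewMask[I] = Mask[SubMask[I]], giving
// {3,2,1,0}: the two shuffles are composed into one equivalent mask.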
/// Order may have elements assigned a special value (size), which is out of
/// bounds. Such indices only appear in places which correspond to undef
/// values (see canReuseExtract for details) and are used in order to avoid
/// undef values having an effect on operand ordering.
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

/// Builds the inverse permutation mask for the given order of indices.
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
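// Worked example (illustrative): Indices = {2,0,1} (scalar I is placed at
// lane Indices[I]) inverts to Mask = {1,2,0}, since Mask[Indices[I]] = I.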
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            UndefValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction, or it is an instruction
/// that does not read/write memory and all of its operands are either not
/// instructions, or phi nodes, or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling because all of
/// its users are phis or live in other basic blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  constexpr int UsesLimit = 8;
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified array of instructions does not require scheduling.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;
  // ...

public:
  // ...

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    // ...
  }

  // ...

  /// Returns whether the root node has in-tree uses.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }

  // ...

  unsigned getMaxVecRegSize() const { return MaxVecRegSize; }

  unsigned getMinVecRegSize() const { return MinVecRegSize; }

  /// \returns the maximum VF for the given opcode and element width, honoring
  /// the -slp-max-vf option if it was set.
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  // ...

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };
  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though it
    // shall be higher for better matches to improve the resulting cost.
    static const int ScoreConsecutiveLoads = 4;
    static const int ScoreReversedLoads = 3;
    static const int ScoreMaskedGatherCandidate = 1;
    static const int ScoreConsecutiveExtracts = 4;
    static const int ScoreReversedExtracts = 3;
    static const int ScoreConstants = 2;
    static const int ScoreSameOpcode = 2;
    static const int ScoreAltOpcodes = 1;
    static const int ScoreSplatLoads = 3;
    static const int ScoreSplat = 1;
    static const int ScoreUndef = 1;
    static const int ScoreFail = 0;
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    /// \p U1 and \p U2 are the users of \p V1 and \p V2.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(V1->getType()) ||
          !isValidElementType(V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            static constexpr unsigned Limit = 8;
            if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                          ElementCount::getFixed(NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return LookAheadHeuristics::ScoreFail;

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  FixedVectorType::get(LI1->getType(), NumLanes),
                  LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return LookAheadHeuristics::ScoreFail;
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be vectorized later.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        if (isa<UndefValue>(V2))
          return LookAheadHeuristics::ScoreConsecutiveExtracts;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2,
                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
                                                         m_Undef())))) {
          // Undefs are always profitable for extractelements.
          if (!Ex2Idx)
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(Dist) == 0)
              return LookAheadHeuristics::ScoreSplat;
            if (std::abs(Dist) > NumLanes / 2)
              return LookAheadHeuristics::ScoreSameOpcode;
            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                              : LookAheadHeuristics::ScoreReversedExtracts;
          }
          return LookAheadHeuristics::ScoreAltOpcodes;
        }
        return LookAheadHeuristics::ScoreFail;
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return LookAheadHeuristics::ScoreFail;
        SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S.getOpcode() &&
            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return cast<Instruction>(V)->getNumOperands() ==
                     S.MainOp->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return LookAheadHeuristics::ScoreFail;
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cummulative score.
    int getScoreAtLevelOf(Value *LHS, Value *RHS, Instruction *U1,
                          Instruction *U2, int CurrLevel,
                          ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of LHS and RHS.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If reached MaxLevel,
      //  or if V1 and V2 are not instructions,
      //  or if they are SPLAT,
      //  or if they are not consecutive,
      //  or if profitable to vectorize loads or extractelements, early return
      //  the current cost.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelOf(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                I1, I2, CurrLevel + 1, std::nullopt);
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };
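  // Illustrative note (not part of the original source): the look-ahead
  // score rewards operand pairs whose defining instructions would themselves
  // vectorize well. Given
  //   %a = add i32 %l0, %x
  //   %b = add i32 %l1, %y
  // where %l0 and %l1 are adjacent loads, scoring (%a, %b) recurses one
  // level, finds ScoreConsecutiveLoads for (%l0, %l1), and so beats an
  // alternative pairing whose operands would need a gather.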
  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (+, -). Therefore, a boolean (APO, Alternate Predicate Opcode)
      /// is enough to record which side of the linearized tree the operand
      /// would attach to.
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering, we are trying to select the operand at the
    /// lane that matches best with the operand at the neighboring lane. Our
    /// selection is based on the type of value we are looking for. For
    /// example, if the neighboring lane has a load, we need to look for a
    /// load that accesses a consecutive address. These strategies are
    /// summarized in the 'ReorderingMode' enumerator.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };
    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
        return 0;
      SmallPtrSet<Value *, 4> Uniques;
      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.insert(OpIdxLnV);
      }
      int UniquesCount = Uniques.size();
      int UniquesCntWithIdxLaneV =
          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      int UniquesCntWithOpIdxLaneV =
          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
              UniquesCntWithOpIdxLaneV) -
             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }

    /// \returns The additional score for the scalar which users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices), they are extracts
      // themselves and already externally used. Vectorization of such
      // instructions does not add extra extractelement instruction, just may
      // remove it.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses.
    static const int ScoreScaleFactor = 10;
    /// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match, the more they match the higher the
    /// score.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      // Keep track of the instruction stack as we recurse into the operands
      // during the look-ahead score exploration.
      int Score =
          LookAhead.getScoreAtLevelOf(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
                                      /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to see the difference between different operands
          // and similar operands but all vectorized/not all vectorized
          // uses. It does not affect actual selection of the best compatible
          // operand in general, just allows to select the operand with all
          // vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }
    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its opreand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the best candidate is predicted to be used right away.
      bool IsUsed =
          RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Constant:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed);
          if (Score > static_cast<int>(BestOp.Score)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Splat:
          if (Op == OpLastLane)
            BestOp.Idx = Idx;
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
    /// Helper for reorder(). \Returns the lane that we should start
    /// reordering from. This is the one which has the least number of
    /// operands that can freely move about, or is less profitable because it
    /// already has the most optimal set of operands.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm: the first unsigned is a counter, the second is the lane.
      DenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Iterate in reverse so that, on ties, the lane with the lowest index
      // wins, staying closer to the original order.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one
        // with the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto It = HashMap.find(NumFreeOpsHash.Hash);
          if (It == HashMap.end())
            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
          else
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }

    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash of the actual operands ordering.
      unsigned Hash = 0;
    };
    /// \returns the maximum number of operands that are able to be reordered.
    /// This is used as an upper limit during operand reordering.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. Since we only have two
      // APOs (true and false), count the 'true' ones and derive the other by
      // subtraction. Operands with the same instruction opcode and parent are
      // more profitable since we don't need to move them in many cases.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode and
        // the number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(isa<Instruction>(VL[0]) && "Expected instruction");
      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
          // Since operand reordering is performed on groups of commutative
          // operations or alternating sequences (e.g., +, -), we can safely
          // tell the inverse operations by checking commutativity.
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return OpsVec.size(); }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// Checks if there is at least a single compatible operand in lanes other
    /// than \p Lane, compatible with the operand \p Op.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // Pick the operand at the current lane that is compatible with Op.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          if (Data.V == Op) {
            FoundCandidate = true;
            Data.IsUsed = true;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return true;
    }
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
               const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
        : TLI(TLI), DL(DL), SE(SE), R(R) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL);
    }

    /// \Returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    // Performs operand reordering for 2 or more operands.
    // The original operands are in OrigOps[OpIdx][Lane].
    // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We use this mode to help us select the
      // instructions for each lane, so that they match best with the ones we
      // have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We visit each lane once and
      // decide on the best order right away with no back-tracking. To
      // increase its effectiveness, we start with the lane that has operands
      // that can move the least, and then visit the remaining lanes in a
      // circular fashion around that starting point.

      // Find the first lane that we will start our search from.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (isa<LoadInst>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else if (isa<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        else if (isa<Argument>(OpLane0))
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else
          // NOTE: This should be unreachable.
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
      }

      // Check that we don't have the same operands. No need to reorder if
      // operands are just a perfect diamond or shuffled diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // TODO: Check if we can remove a check for non-power-2 number of
        // scalars after full support of non-power-2 vectorization.
        return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
      };

      if (SkipReordering())
        return;
      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for the FirstLane, so reorder the
      // rest of the lanes. We are visiting the nodes in a circular fashion,
      // using FirstLane as the center point and increasing the radius
      // distance.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches SortedOps[OpIdx][Lane-1].
            std::optional<unsigned> BestIdx = getBestOperand(
                OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
            // By not selecting a value, we allow the operands that follow to
            // select a better matching value. We will get a non-null value in
            // the next run of getBestOperand().
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // We failed to find a best operand, set mode to 'Failed'.
              ReorderingModes[OpIdx] = ReorderingMode::Failed;
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS.getOpcode() && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // Skip the second pass if the strategy did not fail.
      if (!StrategyFailed)
        return;
      // ...
    }
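    // Illustrative note (not part of the original source): for the two-lane
    // bundle
    //   %r0 = add i32 %x0, %c
    //   %r1 = add i32 %c, %x1
    // reorder() can swap the commutative operands of %r1 so that operand 0
    // becomes {%x0, %x1} and operand 1 becomes the splat {%c, %c}, which is
    // much cheaper to materialize than the unswapped columns.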
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Debug print.
    static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    // ...

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }
#endif
  };
  /// Evaluate each pair in \p Candidates and return the index into
  /// \p Candidates of the pair with the highest score, i.e. the one deemed to
  /// have the best chance of forming the root of a profitable tree. Return
  /// std::nullopt if no candidate scored above \p Limit.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelOf(Candidates[I].first,
                                              Candidates[I].second,
                                              /*U1=*/nullptr, /*U2=*/nullptr,
                                              /*Level=*/1, std::nullopt);
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }
  /// Removes an instruction from its block and eventually deletes it. It's
  /// like Instruction::eraseFromParent() except that the actual deletion is
  /// delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }

  /// Checks if the instruction was already analyzed for being a possible
  /// reduction root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register the given instruction as an already analyzed possible reduction
  /// root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }

  // ...

  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
  }
private:
  // ...

  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only
  /// one user and are reorderable).
  bool
  canReorderOperands(TreeEntry *UserTE,
                     SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
                     ArrayRef<TreeEntry *> ReorderableGathers,
                     SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Checks if the given \p TE is a gather node with clustered reused scalars
  /// and reorders it per the given \p Mask.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the
  /// graph, if any. If it is not vectorized (gather node), returns nullptr.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      TE = getTreeEntry(V);
      return TE;
    });
    if (It != VL.end() && TE->isSame(VL))
      return TE;
    return nullptr;
  }

  /// Const version of the above.
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }
  // ...

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                     const EdgeInfo &EI);

  // ...

  /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
  /// entry \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  /// Create a new vector from a list of scalar values. Produces a sequence
  /// which exploits values reused across lanes, and arranges the inserts for
  /// ease of later optimization.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Args &...Params);

  /// Same as above, materializing the build vector directly.
  Value *createBuildVector(const TreeEntry *E);

  /// \returns the instruction in the bundle which can be used as a base point
  /// for scheduling.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Checks if the gathered \p VL can be represented as shuffle(s) of
  /// previous tree entries.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
                        SmallVectorImpl<int> &Mask,
                        SmallVectorImpl<const TreeEntry *> &Entries);

  // ...

  /// Set the Builder insert point to one after the last instruction in the
  /// bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  // ...

  /// \returns whether the VectorizableTree is fully vectorizable and will be
  /// beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Reorder commutative or alt operands to get a better probability of
  /// generating vectorized code.
  static void reorderInputsAccordingToOpcode(
      ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
      SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
      const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);

  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over
  /// the users of \p TE and collects the stores, returning a map from store
  /// pointers to the collected stores.
  DenseMap<Value *, SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  // ...

  /// Iterates through the users of \p TE, looking for scalar stores that can
  /// potentially be vectorized in a future SLP tree, and returns the order
  /// index vectors found for each store bundle.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  /// This structure holds the tree entries (nodes) of the vectorizable graph.
  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching with the reordering of the scalars.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    /// \returns true if the current entry has the same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those used several times in
    /// the entry and counted in \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    // ...

    enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
    /// The container of this entry. Required to stop users of TreeEntry from
    /// mutating the underlying container.
    VecTreeTy &Container;

    // ...

  public:
    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// Set the operands of this bundle in their original order.
    void setOperandsInOrder() {
      assert(isa<Instruction>(Scalars[0]) && "Expected instruction");
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        Operands[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the single \p OpIdx operand.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main one, the key is \p Op. Otherwise the key
    /// is the main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      MainOp = S.MainOp;
      AltOp = S.AltOp;
    }
    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; }

    unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; }

    /// When ReuseShuffleIndices is empty it just returns the position of \p V
    /// within the vector of Scalars. Otherwise, it tries to remap through the
    /// reuse indices.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
      }
      return FoundLane;
    }
#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        dbgs() << "Vectorize\n";
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
    }
#endif
  };

#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP:     ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP:     VectorCost = " << VecCost << "\n";
    dbgs() << "SLP:     ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP:     ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = std::nullopt,
                          ArrayRef<unsigned> ReorderIndices = std::nullopt) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);
  }

  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = std::nullopt,
                          ArrayRef<unsigned> ReorderIndices = std::nullopt) {
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      Last->setOperations(S);
    } else {
      // Reorder scalars and build the final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (Last->State != TreeEntry::NeedToGather) {
      for (Value *V : VL) {
        assert(!getTreeEntry(V) && "Scalar already in tree!");
        ScalarToTreeEntry[V] = Last;
      }
      // Update the scheduler bundle to point to this TreeEntry.
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
              isVectorLikeInstWithConstOps(S.MainOp) ||
              doesNotNeedToSchedule(VL)) &&
             "Bundle and VL out of sync");
      if (BundleMember) {
        for (Value *V : VL) {
          if (doesNotNeedToBeScheduled(V))
            continue;
          assert(BundleMember && "Unexpected end of bundle.");
          BundleMember->TE = Last;
          BundleMember = BundleMember->NextInBundle;
        }
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);

    return Last;
  }
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  }

  // ...

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar;
    /// Which user that uses the scalar.
    llvm::User *User;
    /// Which lane does the scalar belong to.
    int Lane;
  };
  using UserList = SmallVector<ExternalUser, 16>;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    std::optional<bool> &result = AliasCache[key];
    if (result)
      return *result;
    bool aliased = true;
    if (Loc1.Ptr && isSimple(Inst1))
      aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  DenseMap<AliasCacheKey, std::optional<bool>> AliasCache;

  // ...

  /// A list of values that need to be extracted out of the tree. This list
  /// holds pairs of (Internal Scalar : External User). The external user can
  /// be nullptr, meaning that this internal scalar will be used later, after
  /// vectorization.
  UserList ExternalUses;
  /// Contains all scheduling relevant data for an instruction. A ScheduleData
  /// either represents a single instruction or a member of an instruction
  /// bundle (a group of instructions which is combined into a vector
  /// instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      OpValue = OpVal;
      TE = nullptr;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within a
    /// single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// Opcode of the current instruction in the schedule data.
    Value *OpValue = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block, until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions. This list is derived on demand in
    /// calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on. Allowing such nodes to be scheduled below this one could introduce
    /// a runtime fault which didn't exist in the original program.
    SmallVector<ScheduleData *, 4> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand. If it is InvalidDeps, the number
    /// of dependencies is not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. While scheduling, it is decreased for each
    /// 'ready' dependency.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif
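  // Illustrative note (not part of the original source): UnscheduledDeps
  // counts the not-yet-scheduled dependencies of one node, and
  // unscheduledDepsInBundle() sums it across the bundle. A bundle whose N
  // members each wait on one pending dependency starts at N; every schedule()
  // of a dependency calls incrementUnscheduledDeps(-1), and the bundle enters
  // the ready list exactly when the sum reaches zero.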
  /// Contains all scheduling data for a basic block. It does not schedule
  /// instructions which are not memory read/write instructions and whose
  /// operands are either constants, arguments, phis, or instructions from
  /// other blocks, or whose users are phis or from other blocks.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if this can't possibly be in the map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V, Value *Key) {
      if (V == Key)
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))
          return SD;
      }
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)
          continue;

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert into the ready list if
        // ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              // There are no more unscheduled dependencies after
              // decrementing, so we can put the dependent instruction into
              // the ready list.
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
              LLVM_DEBUG(dbgs()
                         << "SLP:    gets ready (def): " << *DepBundle << "\n");
            }
          });
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be
          // reordered.
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is built recursively, this assertion
          // ensures that the tree entry has all operands set before reaching
          // this code. Extracts are a known exception, since their second
          // (immediate) operand is not added; immediates do not affect
          // scheduler behavior, so this is considered okay.
          auto *In = BundleMember->Inst;
          assert(In &&
                 (isa<ExtractValueInst, ExtractElementInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          (void)In; // fake use to avoid a warning with assertions disabled

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }

    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        (void)SD;
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }

    void doForAllOpcodes(Value *V,
                         function_ref<void(ScheduleData *SD)> Action) {
      if (ScheduleData *SD = getScheduleData(V))
        Action(SD);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))
            Action(P.second);
    }

    /// Put all instructions into the ReadyList which are ready for
    /// scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
              SD->isReady()) {
            ReadyList.insert(SD);
            LLVM_DEBUG(dbgs()
                       << "SLP:    initially in ready list: " << *SD << "\n");
          }
        });
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches ScheduleData to Instruction with the leading key.
    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
        ExtraScheduleDataMap;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleData *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration
    /// this is incremented, which "removes" all ScheduleData from the region.
    /// The initial value must be greater than the initial SchedulingRegionID
    /// in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  // ...

  struct OrdersTypeDenseMapInfo {
    static OrdersType getEmptyKey() {
      OrdersType V;
      V.push_back(~1U);
      return V;
    }

    static OrdersType getTombstoneKey() {
      OrdersType V;
      V.push_back(~2U);
      return V;
    }

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };

  // Analyses and other state.
  // ...
  unsigned MaxVecRegSize; // Set by TTI or overridden by the cl::opt above.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
  // ...
};

} // end namespace slpvectorizer

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    // ...
  public:
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
      OS << "<splat> ";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->State == TreeEntry::NeedToGather)
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize)
      return "color=blue";
    return "";
  }
};
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Clean up any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
/// Reorders the given \p Reuses mask according to the given \p Mask. \p
/// Reuses contains the original mask for the scalars reused in the node.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}

/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. If the resulting \p Order is just the
/// identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Mask.size());
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
    Order.clear();
    return;
  }
  Order.assign(Mask.size(), Mask.size());
  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  unsigned NumScalars = TE.Scalars.size();
  OrdersType CurrentOrder(NumScalars, NumScalars);
  SmallVector<int> Positions;
  SmallBitVector UsedPositions(NumScalars);
  const TreeEntry *STE = nullptr;
  // Try to find all gathered scalars that are vectorized in some other
  // vectorize node. We can only have a single tree vector node here to
  // correctly identify the order of the gathered scalars.
  for (unsigned I = 0; I < NumScalars; ++I) {
    Value *V = TE.Scalars[I];
    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
      continue;
    if (const auto *LocalSTE = getTreeEntry(V)) {
      if (!STE)
        STE = LocalSTE;
      else if (STE != LocalSTE)
        // Take the order only from the single vector node.
        return std::nullopt;
      unsigned Lane =
          std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
      if (Lane >= NumScalars)
        return std::nullopt;
      if (CurrentOrder[Lane] != NumScalars) {
        if (Lane != I)
          continue;
        UsedPositions.reset(CurrentOrder[Lane]);
      }
      // The partial identity (where only some elements of the gather node are
      // in the identity order) is good.
      CurrentOrder[Lane] = I;
      UsedPositions.set(I);
    }
  }
  // Need to keep the order if we have a vector entry and at least 2 scalars
  // or the vectorized entry has just 2 scalars.
  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
    auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
      for (unsigned I = 0; I < NumScalars; ++I)
        if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
          return false;
      return true;
    };
    if (IsIdentityOrder(CurrentOrder))
      return OrdersType();
    auto *It = CurrentOrder.begin();
    for (unsigned I = 0; I < NumScalars;) {
      if (UsedPositions.test(I)) {
        ++I;
        continue;
      }
      if (*It == NumScalars) {
        *It = I;
        ++I;
      }
      ++It;
    }
    return std::move(CurrentOrder);
  }
  return std::nullopt;
}
/// Tracks how the loads in the given sequence can be represented.
enum class LoadsState { Gather, Vectorize, ScatterVectorize };
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  if (!GEP1)
    return false;
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  if (!GEP2)
    return false;
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
         ((isConstant(GEP1->getOperand(1)) &&
           isConstant(GEP2->getOperand(1))) ||
          !CompareOpcodes ||
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
              .getOpcode());
}
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter, or just a simple gather.
static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                    const TargetTransformInfo &TTI,
                                    const DataLayout &DL, ScalarEvolution &SE,
                                    LoopInfo &LI, const TargetLibraryInfo &TLI,
                                    SmallVectorImpl<unsigned> &Order,
                                    SmallVectorImpl<Value *> &PointerOps) {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8 bits: a vectorized load of packed bits would disagree with the
  // unvectorized version.
  Type *ScalarTy = VL0->getType();

  if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple; we can't vectorize atomic
  // or volatile loads.
  PointerOps.clear();
  PointerOps.resize(VL.size());
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
    if (!L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
  if (IsSorted || all_of(PointerOps, [&](Value *P) {
        return arePointersCompatible(P, PointerOps.front(), TLI);
      })) {
    if (IsSorted) {
      Value *Ptr0;
      Value *PtrN;
      if (Order.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[Order.front()];
        PtrN = PointerOps[Order.back()];
      }
      std::optional<int> Diff =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
      // Check that the sorted loads are consecutive.
      if (static_cast<unsigned>(*Diff) == VL.size() - 1)
        return LoadsState::Vectorize;
    }
    // TODO: need to improve analysis of the pointers; if not all of them are
    // GEPs or have > 2 operands, we end up with a gather node, which just
    // increases the cost.
    Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
    bool ProfitableGatherPointers =
        static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
          return L && L->isLoopInvariant(V);
        })) <= VL.size() / 2 && VL.size() > 2;
    if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
          auto *GEP = dyn_cast<GetElementPtrInst>(P);
          return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
                 (GEP && GEP->getNumOperands() == 2);
        })) {
      Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
      for (Value *V : VL)
        CommonAlignment =
            std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
      auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
      if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
          !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
        return LoadsState::ScatterVectorize;
    }
  }

  return LoadsState::Gather;
}
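// Illustrative note (not part of the original source): the three states map
// directly to the code emitted later. Loads of p[0..3] (Diff == VL.size()-1)
// become one wide load (Vectorize); loads of p[0], p[8], p[17] with a legal
// masked gather become an @llvm.masked.gather (ScatterVectorize); anything
// else is built lane by lane (Gather).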
/// Sorts the locations of the load/store accesses by clustering them around
/// common base pointers.
static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
                                   const DataLayout &DL, ScalarEvolution &SE,
                                   SmallVectorImpl<unsigned> &SortedIndices) {
  assert(llvm::all_of(
             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
         "Expected list of pointer operands.");
  // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert
  // each Ptr into, sort, and then return the sorted indices with values next
  // to one another.
  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));

  unsigned Cnt = 1;
  for (Value *Ptr : VL.drop_front()) {
    bool Found = any_of(Bases, [&](auto &Base) {
      std::optional<int> Diff =
          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
                          /*StrictCheck=*/true);
      if (!Diff)
        return false;

      Base.second.emplace_back(Ptr, *Diff, Cnt++);
      return true;
    });

    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return false;

      // Not found already, so add a new Base.
      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
    }
  }

  // For each of the bases sort the pointers by Offset and check if any of
  // them are consecutively allocated.
  bool AnyConsecutive = false;
  for (auto &Base : Bases) {
    auto &Vec = Base.second;
    if (Vec.size() > 1) {
      llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
                                const std::tuple<Value *, int, unsigned> &Y) {
        return std::get<1>(X) < std::get<1>(Y);
      });
      int InitialOffset = std::get<1>(Vec[0]);
      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
      });
    }
  }

  // Fill SortedIndices only if it looks worthwhile to sort the pointers.
  SortedIndices.clear();
  if (!AnyConsecutive)
    return false;

  for (auto &Base : Bases) {
    for (auto &T : Base.second)
      SortedIndices.push_back(std::get<2>(T));
  }

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
  return true;
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
  }

  BoUpSLP::OrdersType Order;
  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
/// Check if two insertelement instructions are from the same buildvector.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getInsertIndex(IE1);
  std::optional<unsigned> Idx2 = getInsertIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operands of the insertelement instructions trying
  // to find either VU as the original vector for IE2 or V as the original
  // vector for IE1.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if need to shuffle reuses; still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // Check if the reuse shuffle indices can be improved by reordering. For
    // this, check that the reuse mask is "clustered", i.e. each scalar value
    // is used once in each submask of size Sz.
    unsigned Sz = TE.Scalars.size();
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz))
      return std::nullopt;
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
        all_of(TE.Scalars, [Sz](Value *V) {
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of the VF size; reuse shuffles are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (all_of(enumerate(ResOrder),
               [](const auto &Data) { return Data.index() == Data.value(); }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::Vectorize &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
      !TE.isAltShuffle())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
      if (!V1->hasOneUse() || !V2->hasOneUse())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          if (!areTwoInsertFromSameBuildVector(
                  IE1, IE2,
                  [](InsertElementInst *II) { return II->getOperand(0); }))
            return false;
          std::optional<unsigned> Idx1 = getInsertIndex(IE1);
          std::optional<unsigned> Idx2 = getInsertIndex(IE2);
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
            return false;
          return *Idx1 < *Idx2;
        }
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
            return false;
          std::optional<unsigned> Idx1 = getExtractIndex(EE1);
          std::optional<unsigned> Idx2 = getExtractIndex(EE2);
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
            return false;
          return *Idx1 < *Idx2;
        }
      return false;
    };
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
        if (Idx != Order[Idx])
          return false;
      return true;
    };
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    DenseMap<Value *, unsigned> PhiToId;
    SmallVector<Value *> Phis;
    OrdersType ResOrder(TE.Scalars.size());
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) {
      PhiToId[TE.Scalars[Id]] = Id;
      Phis.push_back(TE.Scalars[Id]);
    }
    llvm::stable_sort(Phis, PHICompare);
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::NeedToGather) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.getOpcode() == Instruction::ExtractElement &&
          !TE.isAltShuffle()) ||
         (all_of(TE.Scalars,
                 [](Value *V) {
                   return isa<UndefValue, ExtractElementInst>(V);
                 }) &&
          any_of(TE.Scalars,
                 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that the gather of extractelements can be represented as just a
      // shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
      if (Reuse || !CurrentOrder.empty()) {
        if (!CurrentOrder.empty())
          fixupOrderingIndices(CurrentOrder);
        return std::move(CurrentOrder);
      }
    }
    // If the gather node is <undef, v, .., poison> and
    //   insertelement poison, v, 0 [+ permute]
    // is cheaper than
    //   insertelement poison, v, n
    // then try to reorder. If rotating the whole graph, exclude the permute
    // cost since the whole graph might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost)
          return std::move(Order);
      }
    }
    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
    if (TE.Scalars.size() >= 4)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
  }
  return std::nullopt;
}

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder the reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses there is no need to do anything
  // else.
  if (TE.State != TreeEntry::NeedToGather ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear the reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
4306 ExternalUserReorderMap;
4312 for_each(VectorizableTree, [
this, &TTIRef, &VFToOrderedEntries,
4313 &GathersToOrders, &ExternalUserReorderMap,
4314 &AltShufflesToOrders, &PhisToOrders](
4315 const std::unique_ptr<TreeEntry> &TE) {
4318 findExternalStoreUsersReorderIndices(TE.get());
4319 if (!ExternalUserReorderIndices.
empty()) {
4320 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4322 std::move(ExternalUserReorderIndices));
4328 if (TE->isAltShuffle()) {
4331 unsigned Opcode0 = TE->getOpcode();
4332 unsigned Opcode1 = TE->getAltOpcode();
4335 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4336 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4337 OpcodeMask.
set(Lane);
4340 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4346 if (std::optional<OrdersType> CurrentOrder =
4356 const TreeEntry *UserTE = TE.get();
4358 if (UserTE->UserTreeIndices.size() != 1)
4361 return EI.UserTE->State == TreeEntry::Vectorize &&
4362 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4365 UserTE = UserTE->UserTreeIndices.back().UserTE;
4368 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4369 if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
4370 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
4371 if (TE->State == TreeEntry::Vectorize &&
4372 TE->getOpcode() == Instruction::PHI)
4373 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
  // Reorder the graph nodes according to the most frequently used order,
  // starting from the widest vector factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
       /* ... */) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      // ...
    // Count how often each candidate order is requested by the nodes of the
    // current vector factor.
    for (const TreeEntry *OpTE : OrderedEntries) {
      // ...
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        // ...
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->State == TreeEntry::NeedToGather ||
            !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            // ...
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            // ...
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            // ...
        }
        return OpTE->ReorderIndices;
      }();
      // ...
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // ...
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // ...
      }
      // Stores record a mask rather than an order, so the mask must be
      // inverted before it can be counted like the other orders.
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        // ...
        unsigned E = Order.size();
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        // ...
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
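// Illustrative sketch (not part of the pass): OrdersUses acts as a frequency
// counter. insert() returns an <iterator, inserted> pair, so
//   ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
// creates the entry with count 0 on first sight and increments it either
// way, equivalent to this simplified stand-in:
//
//   std::map<std::vector<unsigned>, unsigned> Counter;
//   ++Counter.insert({Order, 0}).first->second; // Counter[Order] += 1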
    // ...
    if (OrdersUses.empty())
      // ...
    // Pick the most frequent order; on a tie prefer the empty (identity)
    // order, which requires no shuffling.
    unsigned Cnt = OrdersUses.front().second;
    for (const auto &Pair : drop_begin(OrdersUses)) {
      if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
        BestOrder = Pair.first;
        // ...
      }
    }
    // ...
    if (BestOrder.empty())
      // ...
    // Convert the winning order into a shuffle mask: in-range indices are
    // kept, out-of-range ones become poison.
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
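// Illustrative sketch (not part of the pass): for BestOrder == [2, 0, 1, 3]
// and E == 4 the transform yields the mask [2, 0, 1, 3]; an order entry
// equal to E (meaning "lane unused") becomes PoisonMaskElem, so
// BestOrder == [1, 0, 4, 4] yields [1, 0, poison, poison].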
    // Do the actual reordering for every node of the current VF.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Nodes with a smaller VF may still participate if every user has
          // the given VF (or the node's own size).
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          // Only the reuses mask of the smaller-VF node needs reordering.
          reorderNodeWithReuses(*TE, Mask);
        }
        // ...
      }
      if (TE->State == TreeEntry::Vectorize &&
          // ...
          !TE->isAltShuffle()) {
        // ...
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        // ...
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // ...
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
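// Illustrative sketch (not part of the pass, assuming addMask's usual
// "apply SubMask on top of Mask" semantics): composing two poison-free
// shuffle masks is plain index chasing,
//
//   // Composed[I] = Mask[SubMask[I]]
//   SmallVector<int> Composed(SubMask.size());
//   for (unsigned I = 0, E = SubMask.size(); I < E; ++I)
//     Composed[I] = Mask[SubMask[I]];
//
// which is why the reuse indices can absorb the reordering above without an
// extra shuffle instruction.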
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    // ...
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    // Skip operand slots already recorded as vectorized edges.
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 OpData.second->State == TreeEntry::Vectorize;
        }))
      // ...
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if the operand node is shared with other user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        // ...
      Edges.emplace_back(I, TE);
      // ...
      if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
        // ...
    }
    // Look for the unique non-vectorized (gather) node feeding this operand
    // slot.
    TreeEntry *Gather = nullptr;
    // ...
        [&Gather, UserTE, I](TreeEntry *TE) {
          assert(TE->State != TreeEntry::Vectorize &&
                 "Only non-vectorized nodes are expected.");
          if (any_of(TE->UserTreeIndices,
                     [UserTE, I](const EdgeInfo &EI) {
                       return EI.UserTE == UserTE && EI.EdgeIdx == I;
                     })) {
            assert(TE->isSame(UserTE->getOperand(I)) &&
                   "Operand entry does not match operands.");
            // ...
          }
          // ...
        }
    // ...
  }
  for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
                              &NonVectorized](
                                 const std::unique_ptr<TreeEntry> &TE) {
    if (TE->State != TreeEntry::Vectorize)
      // ...
    if (std::optional<OrdersType> CurrentOrder =
            /* ... */) {
      OrderedEntries.insert(TE.get());
      if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });
  // ...
  // Propagate orders bottom-up while any reordered node remains.
  while (!OrderedEntries.empty()) {
    // ...
    // Keep only single-user reordered nodes; everything else is filtered out
    // for this round.
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            (TE->State == TreeEntry::NeedToGather &&
             GathersToOrders.count(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          // ...
            return EI.UserTE == TE->UserTreeIndices.front().UserTE;
          // ...
          !Visited.insert(TE).second) {
        // ...
      }
      // Group the operands per user node to speed up the search.
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // Erase the filtered entries so they are not picked again.
    for_each(/* ... */,
             [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
    // Process user nodes in reverse creation order (deepest first).
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // If the operands cannot all be reordered, give up on this user and
      // drop its operands from the worklist.
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              /* ... */)) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        // ...
      }
      // All operands are reordered and used only in this node: count the
      // orders the operands ask for.
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          // ...
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          // ...
        const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
          if (OpTE->State == TreeEntry::NeedToGather ||
              !OpTE->ReuseShuffleIndices.empty())
            return GathersToOrders.find(OpTE)->second;
          return OpTE->ReorderIndices;
        }();
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores record a mask, not an order; invert it before counting.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          // ...
          unsigned E = Order.size();
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          // ...
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        // Also count votes for the identity (empty) order.
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                            const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->State == TreeEntry::NeedToGather) {
            auto It = GathersToOrders.find(TE);
            if (It != GathersToOrders.end())
              return !It->second.empty();
            // ...
          }
          // ...
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            // ...
          // A user that itself allows reordering does not vote against it.
          if (AllowsReordering(UserTE))
            // ...
          // If at most half of this user's operands allow reordering, vote
          // for keeping the identity order.
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
        if (OrdersUses.empty()) {
          // No order reached consensus; drop the operands from the worklist.
          for_each(Data.second,
                   [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                     OrderedEntries.remove(Op.second);
                   });
          // ...
        }
        // Pick the most frequent order; on a tie prefer the identity order.
        unsigned Cnt = OrdersUses.front().second;
        for (const auto &Pair : drop_begin(OrdersUses)) {
          if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
            BestOrder = Pair.first;
            // ...
          }
        }
        // Set the order of the user node itself and of its operands.
        if (BestOrder.empty()) {
          for_each(Data.second,
                   [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                     OrderedEntries.remove(Op.second);
                   });
          // ...
        }
        // ...
        unsigned E = BestOrder.size();
        transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        });
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
          TreeEntry *TE = Op.second;
          OrderedEntries.remove(TE);
          if (!VisitedOps.insert(TE).second)
            // ...
          if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
            reorderNodeWithReuses(*TE, Mask);
            // ...
          }
          // Gathers are processed separately.
          if (TE->State != TreeEntry::Vectorize)
            // ...
          assert((BestOrder.size() == TE->ReorderIndices.size() ||
                  TE->ReorderIndices.empty()) &&
                 "Non-matching sizes of user/operand entries.");
          // ...
          if (IgnoreReorder && TE == VectorizableTree.front().get())
            IgnoreReorder = false;
        }
        // For gathers, only the scalars need to be reordered.
        for (TreeEntry *Gather : GatherOps) {
          assert(Gather->ReorderIndices.empty() &&
                 "Unexpected reordering of gathers.");
          if (!Gather->ReuseShuffleIndices.empty()) {
            // ...
          }
          // ...
          OrderedEntries.remove(Gather);
        }
        // Reorder the operands of the user node and the node itself.
        if (Data.first->State != TreeEntry::Vectorize ||
            !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
                Data.first->getMainOp()) ||
            Data.first->isAltShuffle())
          Data.first->reorderOperands(Mask);
        if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
            Data.first->isAltShuffle()) {
          // ...
          if (Data.first->ReuseShuffleIndices.empty() &&
              !Data.first->ReorderIndices.empty() &&
              !Data.first->isAltShuffle()) {
            // Re-insert the user node to try to sink the reordering deeper
            // into the graph.
            OrderedEntries.insert(Data.first);
          }
        }
      }
    }
  // If the reordering of the root turned out to be unnecessary, drop it.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
  // Collect the values that must be extracted from the vectorized tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      // ...
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      int FoundLane = Entry->findLaneForValue(Scalar);
      // Check if the scalar is externally used as an extra arg.
      auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
      }
      for (User *U : Scalar->users()) {
        // ...
        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars remain scalar in the vectorized code; if
          // so, the one in lane 0 is the one that gets used.
          if (UseScalar != U ||
              UseEntry->State == TreeEntry::ScatterVectorize ||
              // ...
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                       // ...
            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
            // ...
        }
        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          // ...
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
      }
    }
  }
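// Illustrative sketch (not part of the pass): each ExternalUser records the
// scalar to extract, the out-of-tree user (nullptr for extra args) and the
// vector lane the scalar lives in, so that extract code can later be emitted
// along these lines (Builder, Vec, Ex are hypothetical names):
//
//   // %e = extractelement <VF x T> %vec, i32 FoundLane
//   Value *Ex = Builder.CreateExtractElement(Vec, FoundLane);
//   U->replaceUsesOfWith(Scalar, Ex);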
// ...
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // To save compile time, do not visit values with too many users.
    static constexpr unsigned UsersLimit = 4;
    if (V->hasNUsesOrMore(UsersLimit))
      // ...
    // Collect the candidate stores, bucketed per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
          // ...
      // Skip stores that are already part of the tree.
      if (getTreeEntry(U))
        // ...
      auto &StoresVec = PtrToStoresMap[Ptr];
      // Keep at most one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        // ...
      // All stores of a bucket must be in the same basic block ...
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
        // ...
      // ... and store values of the same type.
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
        // ...
      StoresVec.push_back(SI);
    }
  }
  return PtrToStoresMap;
}
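// Illustrative sketch (not part of the pass, assuming Ptr is derived from
// the store's pointer operand): for
//   A[0] = V0; A[1] = V1; B[0] = V2;
// the map ends up with two buckets, {A -> [store V0, store V1]} and
// {B -> [store V2]}; only a bucket with exactly one store per lane can later
// form a vectorizable store group.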
// ...
                            OrdersType &ReorderIndices) const {
  // Sort the stores by their offset from the first store and check that the
  // offsets are consecutive.
  // ...
  StoreOffsetVec[0] = {S0, 0};
  // Compute each store's offset relative to the first store.
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    // ...
    std::optional<int> Diff =
        getPointersDiff(/* ... */,
                        SI->getPointerOperand(), *DL, *SE,
                        /* ... */);
    // ...
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  }
  // Sort the stores by ascending offset.
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  });
  // The stores are consecutive only if each sorted offset is exactly one
  // more than its predecessor.
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
      // ...
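// Illustrative sketch (not part of the pass): after sorting by offset, the
// group is vectorizable only if the offsets form a gap-free run. Offsets
// [0, 1, 2, 3] pass; [0, 1, 3, 4] fail at Idx == 2 because 3 != 1 + 1.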
  // Map each original store to its position in the sorted offset vector;
  // that position is its reorder index.
  ReorderIndices.reserve(StoresVec.size());
  for (StoreInst *SI : StoresVec) {
    unsigned Idx = find_if(StoreOffsetVec,
                           [SI](const std::pair<StoreInst *, int> &Pair) {
                             return Pair.first == SI;
                           }) -
                   StoreOffsetVec.begin();
    ReorderIndices.push_back(Idx);
  }
  // An identity order is canonicalized to the empty order.
  auto IsIdentityOrder = [](const OrdersType &Order) {
    for (unsigned Idx : seq<unsigned>(0, Order.size()))
      // ...
  };
  if (IsIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
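// Illustrative sketch (not part of the pass): for stores collected at
// relative offsets [2, 0, 1], the sorted positions of the three stores are
// 2, 0 and 1, giving ReorderIndices == [2, 0, 1]; an already consecutive
// group produces [0, 1, 2], which the identity check above collapses to the
// empty order, meaning "no reordering needed".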
// ...
  for (unsigned Idx : Order)
    // ...
// ...
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
      collectUserStores(TE);
  // ...
  SmallVector<OrdersType, 1> ExternalReorderIndices;
  // Inspect the buckets of collected stores and keep the reorder indices of
  // every bucket that can form a vector.
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // With fewer than NumLanes stores, no vector can be formed.
    if (StoresVec.size() != NumLanes)
      // ...
    // Abandon the bucket if the stores are not consecutive.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      // ...
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
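// Illustrative sketch (not part of the pass): given a tree entry for
// {V0, V1} whose scalars feed the stores A[1] = V0; A[0] = V1;, the stores
// form one bucket of size NumLanes; canFormVector sees them in reversed
// address order and returns ReorderIndices == [1, 0], which is how an
// external store pattern can request a reordering of the whole node.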
  UserIgnoreList = &UserIgnoreLst;
  // ...
  buildTree_rec(Roots, 0, EdgeInfo());
// ...
  buildTree_rec(Roots, 0, EdgeInfo());
// ...
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {