#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

    cl::desc("Run the SLP vectorization passes"));
    cl::desc("Only vectorize if you gain more than this "

    cl::desc("Attempt to vectorize horizontal reductions"));

        "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("Display the SLP trees with Graphviz"));
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&

  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);

  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
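// A splat here tolerates undef lanes: the loop records the first non-undef
// value and fails on any later non-undef mismatch, and the final check
// requires at least one non-undef lane for the list to count as a splat.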
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
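  // The loop above flattens a multi-level aggregate index into one linear
  // index: each struct/array level scales the running Index by that level's
  // element count before the level's own index is added. For example, for a
  // [4 x {i32, i32}] aggregate, indices {2, 1} flatten to 2 * 2 + 1 = 5.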
  if (MaskArg == UseMask::UndefsAsMask)
  if (MaskArg == UseMask::FirstArg && Value < VF)
    UseMask.reset(Value);
  else if (MaskArg == UseMask::SecondArg && Value >= VF)
    UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      Base = II->getOperand(0);
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
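// CommonShuffleMode classifies the extractelement sequence: Select means each
// lane keeps its position and merely chooses between the two source vectors,
// so a cheaper select-style shuffle suffices; any lane that moves to a new
// position degrades the whole sequence to Permute. A two-source shuffle kind
// is only reported when a second vector (Vec2) was actually seen.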
  unsigned Opcode = E->getOpcode();
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
struct InstructionsState {
  Value *OpValue = nullptr;

  unsigned getAltOpcode() const {

  bool isAltShuffle() const { return AltOp != MainOp; }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  InstructionsState() = delete;
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}

  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
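// An InstructionsState with MainOp != AltOp describes an "alternate" bundle
// such as {add, sub, add, sub}: isOpcodeOrAlt accepts any instruction whose
// opcode matches either of the two, and such a bundle can later be emitted as
// two vector instructions combined by a shufflevector.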
                                       unsigned BaseIndex = 0);

         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||

         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&

                                       unsigned BaseIndex) {
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
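// getSameOpcode returns a state whose MainOp/AltOp are null whenever the
// bundle mixes incompatible instructions (different GEP shapes, non-simple
// loads, mismatched call mappings or operand bundles, and so on); a
// null-opcode state tells callers the scalars can only be gathered, not
// vectorized as a single or alternate opcode.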
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;

  if (LoadInst *LI = dyn_cast<LoadInst>(I))

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
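// inversePermutation builds the mask that undoes a reordering, via
// Mask[Indices[I]] = I. For example, Indices = {2, 0, 1} yields
// Mask = {1, 2, 0}, so applying Mask after Indices restores identity order.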
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];

  auto *I = dyn_cast<Instruction>(V);
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);
  constexpr int UsesLimit = 8;
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
namespace slpvectorizer {

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {

    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    return MaxVecRegSize;

    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";

        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
    if (isa<LoadInst>(V1)) {
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        static constexpr unsigned Limit = 8;
        if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))

    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
      if (std::abs(*Dist) > NumLanes / 2)
    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);
      if (isa<UndefValue>(V2))
      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)

    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
      if (I1->getParent() != I2->getParent())
      if (S.getOpcode() &&
          (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
            return cast<Instruction>(V)->getNumOperands() ==
                   S.MainOp->getNumOperands();

      if (isa<UndefValue>(V2))
    int ShallowScoreAtThisLevel =
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, std::nullopt);
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
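    // This recursion pairs up the operands of I1 and I2: for each operand of
    // I1 it keeps the best-scoring not-yet-used operand of I2 (Op2Used
    // prevents double-matching) and accumulates the winners into the score
    // for this level. Recursion stops at MaxLevel or when the values
    // themselves already determine the score, e.g. matching loads or
    // extracts.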
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;

  enum class ReorderingMode {

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];

  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
         OpIdx != NumOperands; ++OpIdx)
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
    for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))
      Uniques.insert(OpIdxLnV);
    int UniquesCount = Uniques.size();
    int UniquesCntWithIdxLaneV =
        Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    int UniquesCntWithOpIdxLaneV =
        Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
               UniquesCntWithOpIdxLaneV) -
           (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
      std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
        RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Constant:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed);
        if (Score > static_cast<int>(BestOp.Score)) {
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Splat:
        if (Op == OpLastLane)
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
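  // getBestOperand scans the current lane's operands that share the APO flag
  // of the slot being filled and, depending on the reordering mode, either
  // requires an exact repeat of the previous lane's value (Splat) or picks
  // the candidate with the highest look-ahead score (Load/Constant/Opcode);
  // returning std::nullopt leaves the lane unchanged.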
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto *It = HashMap.find(NumFreeOpsHash.Hash);
        if (It == HashMap.end())
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(isa<Instruction>(VL[0]) && "Expected instruction");
    unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),

  unsigned getNumOperands() const { return OpsVec.size(); }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }
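  // The APO flag computed in appendOperandsOfVL marks operands attached to an
  // inverse (non-commutative) operation: operand 0 never is, while later
  // operands of a non-commutative instruction such as sub are. Candidates are
  // only paired when their APO flags match, so lane reordering cannot
  // silently swap the LHS and RHS of a subtraction.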
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        FoundCandidate = true;
      if (!FoundCandidate)

      : TLI(TLI), DL(DL), SE(SE), R(R) {
    appendOperandsOfVL(RootVL);

    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);
      if (isa<LoadInst>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else if (isa<Instruction>(OpLane0)) {
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      else if (isa<Argument>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Failed;

    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {

    if (SkipReordering())
    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
          std::optional<unsigned> BestIdx = getBestOperand(
              OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
            swap(OpIdx, *BestIdx, Lane);
            ReorderingModes[OpIdx] = ReorderingMode::Failed;
            StrategyFailed = true;
          if (MainAltOps[OpIdx].size() != 2) {
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
            if (OpS.getOpcode() && OpS.isAltShuffle())
    if (!StrategyFailed)
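    // reorder() starts from the most constrained lane
    // (getBestLaneToStartReordering) and sweeps outwards from it in both
    // directions, one Distance at a time, so each lane is matched against an
    // already-reordered neighbor; when a slot cannot be satisfied its operand
    // is marked ReorderingMode::Failed and StrategyFailed is set.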
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {

    DeletedInstructions.insert(I);

    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
  bool collectValuesToDemote(

  canReorderOperands(TreeEntry *UserTE,

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

                             const EdgeInfo &EI);

                      bool ResizeAllowed = false) const;

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Args &...Params);

  Value *createBuildVector(const TreeEntry *E);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
                                  unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  static void reorderInputsAccordingToOpcode(

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

    TreeEntry(VecTreeTy &Container) : Container(Container) {}
          [Scalars](Value *V, int Idx) {
            return (isa<UndefValue>(V) && Idx == PoisonMaskElem) ||
                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
      PossibleStridedVectorize,

    VecTreeTy &Container;

      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");

    void setOperandsInOrder() {
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");

    bool isAltShuffle() const { return MainOp != AltOp; }

      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);

      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))

    void setOperations(const InstructionsState &S) {

    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;

    unsigned getAltOpcode() const {

    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
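    // findLaneForValue maps the scalar's position first through
    // ReorderIndices (the order the scalars are emitted in) and then through
    // ReuseShuffleIndices (which lanes of the emitted vector replicate that
    // position), so the returned lane indexes the final vector value rather
    // than the Scalars array.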
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
        dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case PossibleStridedVectorize:
        dbgs() << "PossibleStridedVectorize\n";
        dbgs() << "NeedToGather\n";
      dbgs() << "MainOp: ";
        dbgs() << *MainOp << "\n";
      dbgs() << "AltOp: ";
        dbgs() << *AltOp << "\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "SLP: " << Banner << ":\n";
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (Last->State != TreeEntry::NeedToGather) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
               "Scalar already in tree!");
          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
        ScalarToTreeEntry[V] = Last;
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
             "Bundle and VL out of sync");
      for (Value *V : VL) {
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      assert(!BundleMember && "Bundle and VL out of sync");
      MustGather.insert(VL.begin(), VL.end());
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
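    // Every vectorized scalar is registered in ScalarToTreeEntry so later
    // getTreeEntry lookups can map a scalar back to its tree node; a scalar
    // that ends up in several entries goes through MultiNodeScalars instead.
    // Gathered scalars are recorded in MustGather, and each ScheduleData
    // member of the bundle is linked back to the new entry via
    // BundleMember->TE.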
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);

  TreeEntry::EntryState getScalarsVectorizationState(

  using ValueToGatherNodesMap =
  ValueToGatherNodesMap ValueToGatherNodes;

  struct ExternalUser {

    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
  struct ScheduleData {
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");

        assert(isSchedulingEntity() && "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();

    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;

      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;

    Value *OpValue = nullptr;

    TreeEntry *TE = nullptr;

    ScheduleData *FirstInBundle = nullptr;

    ScheduleData *NextInBundle = nullptr;

    ScheduleData *NextLoadStore = nullptr;

    int SchedulingRegionID = 0;

    int SchedulingPriority = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

    bool IsScheduled = false;
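    // Dependencies counts a node's total dependencies, while UnscheduledDeps
    // counts those whose source has not been scheduled yet; both start at
    // InvalidDeps (-1) until dependency calculation fills them in, which is
    // what hasValidDependencies() tests. A bundle becomes ready once the sum
    // of UnscheduledDeps over its members (unscheduledDepsInBundle) reaches
    // zero.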
                       const BoUpSLP::ScheduleData &SD) {

  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;
      ++SchedulingRegionID;

      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);

    ScheduleData *getScheduleData(Value *V, Value *Key) {
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)

        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
                         << "SLP: gets ready (def): " << *DepBundle << "\n");

        if (TreeEntry *TE = BundleMember->TE) {
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          auto *In = BundleMember->Inst;
                 (isa<ExtractValueInst, ExtractElementInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))

          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))

        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");

        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
    void doForAllOpcodes(Value *V,
      if (ScheduleData *SD = getScheduleData(V))
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))

    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            ReadyList.insert(SD);
                       << "SLP: initially in ready list: " << *SD << "\n");

    std::optional<ScheduleData *>
                      const InstructionsState &S);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

                             ScheduleData *PrevLoadStore,
                             ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,

    void resetSchedule();

        ExtraScheduleDataMap;

    ScheduleData *FirstLoadStoreInRegion = nullptr;

    ScheduleData *LastLoadStoreInRegion = nullptr;

    bool RegionHasStackSave = false;

    int ScheduleRegionSize = 0;

    int SchedulingRegionID = 1;
  void scheduleBlock(BlockScheduling *BS);

  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  struct ChildIteratorType
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

    return R.VectorizableTree[0].get();

    return {N->UserTreeIndices.begin(), N->Container};

    return {N->UserTreeIndices.end(), N->Container};

  class nodes_iterator {
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

    return nodes_iterator(R->VectorizableTree.begin());

    return nodes_iterator(R->VectorizableTree.end());

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;

    if (Entry->State == TreeEntry::NeedToGather)
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::PossibleStridedVectorize)
      return "color=blue";

  for (auto *I : DeletedInstructions) {
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
    I->dropAllReferences();
  for (auto *I : DeletedInstructions) {
           "trying to erase instruction with users.");
    I->eraseFromParent();
#ifdef EXPENSIVE_CHECKS

  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Reuses[Mask[I]] = Prev[I];

  assert(!Mask.empty() && "Expected non-empty mask.");
  if (Order.empty()) {
    MaskOrder.resize(Mask.size());
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  Order.assign(Mask.size(), Mask.size());
  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
      Order[MaskOrder[I]] = I;
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  unsigned NumScalars = TE.Scalars.size();
  OrdersType CurrentOrder(NumScalars, NumScalars);
  const TreeEntry *STE = nullptr;
  for (unsigned I = 0; I < NumScalars; ++I) {
    Value *V = TE.Scalars[I];
    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
    if (const auto *LocalSTE = getTreeEntry(V)) {
      else if (STE != LocalSTE)
        return std::nullopt;
          std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
      if (Lane >= NumScalars)
        return std::nullopt;
      if (CurrentOrder[Lane] != NumScalars) {
        UsedPositions.reset(CurrentOrder[Lane]);
      CurrentOrder[Lane] = I;
      UsedPositions.set(I);
  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
      for (unsigned I = 0; I < NumScalars; ++I)
        if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
    if (IsIdentityOrder(CurrentOrder))
    auto *It = CurrentOrder.begin();
    for (unsigned I = 0; I < NumScalars;) {
      if (UsedPositions.test(I)) {
      if (*It == NumScalars) {
    return std::move(CurrentOrder);
  return std::nullopt;
enum class LoadsState {
  PossibleStridedVectorize

                           bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
             getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)

  if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();

  bool IsPossibleStrided = false;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    std::optional<int> Diff =
      if (static_cast<unsigned>(*Diff) == VL.size() - 1)
        return LoadsState::Vectorize;
      IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
    bool ProfitableGatherPointers =
          return L && L->isLoopInvariant(V);
        })) <= VL.size() / 2 && VL.size() > 2;
    if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
          auto *GEP = dyn_cast<GetElementPtrInst>(P);
                 (GEP && GEP->getNumOperands() == 2);
      Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
          std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
      return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
                               : LoadsState::ScatterVectorize;

  return LoadsState::Gather;
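// The LoadsState result encodes how a bundle of loads can be emitted:
// Vectorize when the sorted pointers are fully consecutive (pointer distance
// VL.size() - 1 between the first and last), PossibleStridedVectorize when
// that distance is a multiple of a common stride, ScatterVectorize for a
// masked gather over arbitrary pointers, and Gather when none of these is
// legal or profitable.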
      VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
      "Expected list of pointer operands.");
  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
    std::optional<int> Diff =
      Base.second.emplace_back(Ptr, *Diff, Cnt++);
      if (Bases.size() > VL.size() / 2 - 1)
      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);

  bool AnyConsecutive = false;
  for (auto &Base : Bases) {
    auto &Vec = Base.second;
    if (Vec.size() > 1) {
                const std::tuple<Value *, int, unsigned> &Y) {
        return std::get<1>(X) < std::get<1>(Y);
      int InitialOffset = std::get<1>(Vec[0]);
        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
  SortedIndices.clear();
  if (!AnyConsecutive)
  for (auto &Base : Bases) {
    for (auto &T : Base.second)
         "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    return std::move(Order);
  return std::nullopt;
  if (VU->getType() != V->getType())
  if (!VU->hasOneUse() && !V->hasOneUse())
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
    if (IE2 == VU && !IE1)
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
      IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    if (IE2 && IE2 != VU) {
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
      IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
  } while (!IsReusedIdx && (IE1 || IE2));
std::optional<BoUpSLP::OrdersType>
  if (!TE.ReuseShuffleIndices.empty()) {
    unsigned Sz = TE.Scalars.size();
      return std::nullopt;
    unsigned VF = TE.getVectorFactor();
                                 TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        Value *V = TE.Scalars[ReorderMask[Idx]];
          Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      std::iota(SubMask.begin(), SubMask.end(), 0);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
        [](const auto &Data) { return Data.index() == Data.value(); }))
      return std::nullopt;
    return std::move(ResOrder);
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::PossibleStridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (!V1->hasOneUse() || !V2->hasOneUse())
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
          return *Idx1 < *Idx2;
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
          return *Idx1 < *Idx2;
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    std::iota(Phis.begin(), Phis.end(), 0);
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt;
    return std::move(ResOrder);
  if (TE.State == TreeEntry::NeedToGather) {
    if (((TE.getOpcode() == Instruction::ExtractElement &&
          !TE.isAltShuffle()) ||
           return isa<UndefValue, ExtractElementInst>(V);
         [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
      if (Reuse || !CurrentOrder.empty()) {
        if (!CurrentOrder.empty())
        return std::move(CurrentOrder);
    int Sz = TE.Scalars.size();
        find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
    if (It == TE.Scalars.begin())
    if (It != TE.Scalars.end()) {
      unsigned Idx = std::distance(TE.Scalars.begin(), It);
      if (InsertFirstCost + PermuteCost < InsertIdxCost)
        return std::move(Order);
      return CurrentOrder;
    if (TE.Scalars.size() >= 4)
  return std::nullopt;

  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    if (Cluster != FirstCluster)
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  const unsigned Sz = TE.Scalars.size();
  if (TE.State != TreeEntry::NeedToGather ||
  addMask(NewMask, TE.ReuseShuffleIndices);
  TE.ReorderIndices.clear();
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
      ExternalUserReorderMap;
  for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
                              &GathersToOrders, &ExternalUserReorderMap,
                              &AltShufflesToOrders, &PhisToOrders](
                                 const std::unique_ptr<TreeEntry> &TE) {
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
                                      std::move(ExternalUserReorderIndices));
    if (TE->isAltShuffle()) {
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
        if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
      const TreeEntry *UserTE = TE.get();
        if (UserTE->UserTreeIndices.size() != 1)
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::PossibleStridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
    for (const TreeEntry *OpTE : OrderedEntries) {
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
        if (OpTE->State == TreeEntry::NeedToGather ||
            !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
        return OpTE->ReorderIndices;
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
      if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
        StridedVectorizeOrders.push_back(Order);
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4475 if (OrdersUses.empty()) {
4476 if (StridedVectorizeOrders.
empty())
4479 for (
OrdersType &Order : StridedVectorizeOrders)
4480 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4484 for (
OrdersType &Order : StridedVectorizeOrders) {
4485 auto *It = OrdersUses.find(Order);
4486 if (It != OrdersUses.end())
4492 unsigned Cnt = OrdersUses.front().second;
4493 for (
const auto &Pair :
drop_begin(OrdersUses)) {
4494 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4495 BestOrder = Pair.first;
4500 if (BestOrder.
empty())
4505 unsigned E = BestOrder.
size();
4507 return I < E ? static_cast<int>(I) : PoisonMaskElem;
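    // Mask is the inverse permutation of BestOrder: for example, the order
    // {1, 2, 3, 0} yields the mask {3, 0, 1, 2}, while {1, 0, 3, 2} is its
    // own inverse. MaskOrder is BestOrder itself rewritten as an int
    // shuffle mask, with placeholder entries mapped to PoisonMaskElem.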
    // Do the actual reordering for the nodes with the given VF.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Reorder the reuse masks of operands with a smaller VF so they
          // still match the scalar operands of the user nodes.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::PossibleStridedVectorize) &&
          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
              InsertElementInst>(TE->getMainOp()) &&
          !TE->isAltShuffle()) {
        // Build correct orders for extracts, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra reorder shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
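/// Checks whether the operands of \p UserTE can be reordered: every operand
/// must either be a vectorized node used only by \p UserTE or a reorderable
/// gather; matching gather nodes are collected in \p GatherOps.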
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 OpData.second->State == TreeEntry::Vectorize;
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder (possibly!) strided vectorized nodes: reordering of
      // their operands is not implemented yet.
      if (TE->State == TreeEntry::PossibleStridedVectorize)
        return false;
      // Do not reorder if the operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands: only reordering
      // of the scalars is required, similar to gathers. If there are reused
      // scalars, process the node as a regular vectorize node and just
      // reorder the reuses mask.
      if (TE->State != TreeEntry::Vectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::PossibleStridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::PossibleStridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  }

  // Propagate the order to the graph nodes that use only reordered nodes:
  // if a node has reordered operands, try to keep at least one operand in
  // natural order, reorder the others and reorder the user node itself.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only the reordered nodes.
    // 2. If an entry has multiple uses, skip it and go to the next node.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::PossibleStridedVectorize ||
            (TE->State == TreeEntry::NeedToGather &&
             GathersToOrders.count(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between the user nodes and their operands' orders to
      // speed up the search; the graph does not provide this dependency
      // directly.
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // Erase the filtered entries so they are not visited again.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
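    // Process user nodes from the bottom of the graph upwards (larger Idx
    // first) so operand orders are settled before their users are visited.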
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that the operands are used only in the user node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate
      // the most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Last chance orders from (possibly!) strided vectorization.
      SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
      // Analyze each tree entry only once, otherwise the order of the same
      // node may be counted several times.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() &&
            !GathersToOrders.count(OpTE))
          continue;
        const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
          if (OpTE->State == TreeEntry::NeedToGather ||
              !OpTE->ReuseShuffleIndices.empty())
            return GathersToOrders.find(OpTE)->second;
          return OpTE->ReorderIndices;
        }();
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Postpone (possibly!) strided vectorization orders.
        if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
          StridedVectorizeOrders.emplace_back(Order, NumOps);
          continue;
        }
        // Stores actually store the mask, not the order; invert it first.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        // Count the users that keep the natural (empty) order.
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                            const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() ||
              !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->State == TreeEntry::NeedToGather) {
            auto It = GathersToOrders.find(TE);
            if (It != GathersToOrders.end())
              return !It->second.empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // The user may be reordered if it requires reordering itself, has
          // reused scalars or is an alternate-opcode vectorize node.
          if (AllowsReordering(UserTE))
            continue;
          // Look up just one level of operands to limit compile time; vote
          // for the natural order unless definitely more operands allow
          // reordering than require the natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        if (StridedVectorizeOrders.empty() ||
            (Data.first->ReorderIndices.empty() &&
             Data.first->ReuseShuffleIndices.empty() &&
             !(IgnoreReorder &&
               Data.first == VectorizableTree.front().get()))) {
          for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
            OrderedEntries.remove(Op.second);
          continue;
        }
        // Add the (potentially!) strided vectorization orders.
        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
          OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
              Pair.second;
      } else {
        // Account the strided orders only if they were counted already.
        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
          auto *It = OrdersUses.find(Pair.first);
          if (It != OrdersUses.end())
            It->second += Pair.second;
        }
      }
      // Choose the most used order.
      OrdersType BestOrder = OrdersUses.front().first;
      unsigned Cnt = OrdersUses.front().second;
      for (const auto &Pair : drop_begin(OrdersUses)) {
        if (Cnt < Pair.second ||
            (Cnt == Pair.second && Pair.first.empty())) {
          BestOrder = Pair.first;
          Cnt = Pair.second;
        }
      }
      // An empty order keeps the original ordering; nothing to do then.
      if (BestOrder.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Erase the operands from OrderedEntries and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
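      // Apply BestOrder to every operand entry exactly once: operands whose
      // reuse mask matches the chosen order go through
      // reorderNodeWithReuses, while gathers only get their scalars (or
      // reuses) permuted.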
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::PossibleStridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just reorder the scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder the reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder the operands of the user node and set the ordering for the
      // user node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::PossibleStridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert the user node into the work list to try to sink the
          // reordering deeper in the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the final reordering is unnecessary, just remove it.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
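/// Records, for every vectorized scalar, the external users that will need
/// an extractelement from the vectorized value: users outside the tree,
/// extra arguments, and in-tree users of scalars that must stay scalar.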
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // Collect the values that need to be extracted from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      int FoundLane = Entry->findLaneForValue(Scalar);

      // Check if the scalar is externally used as an extra arg.
      const auto *ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
      }
      for (User *U : Scalar->users()) {
        auto *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalars in the vectorized
          // instructions; in that case the one in lane 0 will be used.
          if (UseScalar != U ||
              UseEntry->State == TreeEntry::ScatterVectorize ||
              UseEntry->State == TreeEntry::PossibleStridedVectorize ||
              !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
      }
    }
  }
}
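/// Collects the stores fed by the scalars of \p TE, grouped by the
/// underlying pointer object, as candidates for vectorizing a store chain
/// rooted at external users of this node.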
DenseMap<Value *, SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // To save compilation time, don't visit values with too many users.
    static constexpr unsigned UsersLimit = 4;
    if (V->hasNUsesOrMore(UsersLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entries that are already vectorized.
      if (getTreeEntry(U))
        continue;

      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      auto &StoresVec = PtrToStoresMap[Ptr];
      // For now just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      // Skip stores in different basic blocks.
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
        continue;
      // Make sure that the stores are of the same type.
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
        continue;
      StoresVec.push_back(SI);
    }
  }
  return PtrToStoresMap;
}
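/// Checks whether the stores in \p StoresVec can form a vector instruction:
/// their pointer offsets from the first store must be pairwise comparable
/// and, after sorting, strictly consecutive. On success, \p ReorderIndices
/// receives the permutation that sorts the stores by address.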
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // To avoid calling getPointersDiff() while sorting, create a vector of
  // pairs {store, offset from the first store} and sort that instead.
  SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec[0] = {S0, 0};
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    // We failed to compare the pointers, so abandon this StoresVec.
    if (!Diff)
      return false;
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  }

  // Sort the vector by the stores' offsets from the first store.
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  });
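  // Example: stores at offsets {0, 2, 1} from the first store sort to
  // {0, 1, 2}, pass the consecutive check below, and yield ReorderIndices
  // {0, 2, 1}: the position of each original store in the sorted order.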
  // Check that the stores are consecutive: each offset must be one past the
  // previous one.
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
      return false;

  // Calculate the shuffle indices from each store's position in the sorted
  // StoreOffsetVec.
  ReorderIndices.reserve(StoresVec.size());
  for (StoreInst *SI : StoresVec) {
    unsigned Idx = find_if(StoreOffsetVec,
                           [SI](const std::pair<StoreInst *, int> &Pair) {
                             return Pair.first == SI;
                           }) -
                   StoreOffsetVec.begin();
    ReorderIndices.push_back(Idx);
  }
  auto IsIdentityOrder = [](const OrdersType &Order) {