#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> ShouldVectorizeHor(
    "slp-vectorize-hor", cl::init(true), cl::Hidden,
    cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int> MaxVectorRegSizeOption(
    "slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int> MaxStoreLookup(
    "slp-max-store-lookup", cl::init(32), cl::Hidden,
    cl::desc("Maximum depth of the lookup for consecutive stores."));

static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));
/// Predicate for the element types that the SLP vectorizer supports.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type
/// or extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative; handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();
  return false;
}
/// \returns the inserting index of an InsertElement or InsertValue
/// instruction, using the source value as a base.
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
                                              unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
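
// Worked example: for
//   %r = insertvalue {i32, [2 x float]} %agg, float %f, 1, 0
// the loop flattens the nested indices: at the struct level
// Index = 0 * 2 + 1 = 1, then at the array level Index = 1 * 2 + 0 = 2,
// so the inserted scalar lands at flattened position 2.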
  for (auto P : enumerate(Mask)) {
    if (P.value() == UndefMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(P.index());
      continue;
    }
    if (MaskArg == UseMask::FirstArg && P.value() < VF)
      UseMask.reset(P.value());
    else if (MaskArg == UseMask::SecondArg && P.value() >= VF)
      UseMask.reset(P.value() - VF);
  }
/// Checks if the given value is actually an undefined constant vector.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;

  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());

  auto *C = dyn_cast<Constant>(V);

  if (!UseMask.empty()) {
    const Value *Base = V;
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      if (isa<T>(II->getOperand(1)))
        continue;
      Base = II->getOperand(0);

      std::optional<unsigned> Idx = getInsertIndex(II);
      if (!Idx)
        continue;
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
        Res.reset(*Idx);
    }

    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  }

  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It =
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  if (It == VL.end())
    return std::nullopt;
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
    return std::nullopt;
  unsigned Size =
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), UndefMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
      return std::nullopt;
    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return std::nullopt;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    Mask[I] = IntIdx;
    // For correct shuffling we have to have at most 2 different vector
    // operands in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never filled, we consider all the extractelement instructions
  // as permutations of a single vector.
  return TargetTransformInfo::SK_PermuteSingleSrc;
}
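
// Illustration: with two 4-wide sources, extracts {v1[0], v1[1], v2[2], v2[3]}
// produce the mask {0, 1, 6, 7} - every lane keeps its position and merely
// picks a source, so this is a select (blend). As soon as some extract index
// differs from its lane number, the whole sequence becomes a permute.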
/// \returns the extract index of an extractelement/extractvalue instruction.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main and alternate instructions.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
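
// Example: for VL = {add, sub, add, sub} the state is MainOp = add,
// AltOp = sub, so isAltShuffle() returns true and the bundle can later be
// emitted as two wide instructions blended by a shufflevector.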
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);
  return (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||

  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are
        // not compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);
        if (E == 2 &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;
        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
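
// Usage sketch (using the accessors defined on InstructionsState above):
//   InstructionsState S = getSameOpcode(VL, TLI);
//   if (!S.getOpcode())
//     return;                  // no common or alternate opcode -> gather
//   if (S.isAltShuffle())
//     handleAltNode(S);        // e.g. {fadd, fsub, fadd, fsub}
// (handleAltNode is a placeholder for illustration, not a function in this
// file.) A null MainOp is the "no common opcode" signal used throughout.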
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;
  return true;
}
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);

/// \returns True if the memory instruction \p I is simple (not volatile or
/// atomic).
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
        Mask[SubMask[I]] >= TermValue)
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
/// Builds the inverse of the given permutation: Mask[Indices[I]] = I.
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, UndefMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
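
// Worked example: Indices = {2, 0, 1} yields Mask[2] = 0, Mask[0] = 1,
// Mask[1] = 2, i.e. Mask = {1, 2, 0}. Shuffling a vector that was reordered
// by Indices with this mask restores the original element order.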
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            UndefValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != UndefMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });

static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  constexpr int UsesLimit = 8;
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
namespace slpvectorizer {

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    return !VectorizableTree.empty() &&
           VectorizableTree.front()->State == TreeEntry::Vectorize;
  }

    assert(!VectorizableTree.empty() && "No tree to get the first node from");
    return VectorizableTree.front()->getMainOp();

    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();

    EntryToLastInstruction.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
  std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
                                              bool TopToBottom);

  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

    OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
       << " EdgeIdx:" << EdgeIdx << "}";

    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
    if (isa<LoadInst>(V1)) {
      // Returns true if the users of V1 and V2 won't need to be extracted.
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        // Bail out if we have too many uses to save compile time.
        static constexpr unsigned Limit = 8;
        if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
          return false;

        auto AllUsersVectorized = [U1, U2, this](Value *V) {
          return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
          });
        };
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      };
      // A broadcast of a load can be cheaper on some targets.
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                      ElementCount::getFixed(NumLanes)) &&
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
        return LookAheadHeuristics::ScoreSplatLoads;
    }

    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);
    if (LI1 && LI2) {
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          !LI2->isSimple())
        return LookAheadHeuristics::ScoreFail;

      std::optional<int> Dist = getPointersDiff(
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
      if (!Dist || *Dist == 0) {
        if (R.TTI->isLegalMaskedGather(
                FixedVectorType::get(LI1->getType(), NumLanes),
                LI1->getAlign()))
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        return LookAheadHeuristics::ScoreFail;
      }
      // The distance is too large - still may be profitable to use masked
      // loads/gathers.
      if (std::abs(*Dist) > NumLanes / 2)
        return LookAheadHeuristics::ScoreMaskedGatherCandidate;
    }

    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);
    if (C1 && C2)
      return LookAheadHeuristics::ScoreConstants;

    if (isa<UndefValue>(V2))
      return LookAheadHeuristics::ScoreUndef;

    Value *EV2 = nullptr;

      int Dist = Idx2 - Idx1;
      // The distance is too large - still may be profitable to use shuffles.
      if (std::abs(Dist) == 0)
        return LookAheadHeuristics::ScoreSplat;
      if (std::abs(Dist) > NumLanes / 2)
        return LookAheadHeuristics::ScoreSameOpcode;

    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
    if (I1 && I2) {
      if (I1->getParent() != I2->getParent())
        return LookAheadHeuristics::ScoreFail;

      InstructionsState S = getSameOpcode({I1, I2}, TLI);
      if (S.getOpcode() &&
          (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
          all_of(Ops, [&S](Value *V) {
            return cast<Instruction>(V)->getNumOperands() ==
                   S.MainOp->getNumOperands();
          }))
        return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                : LookAheadHeuristics::ScoreSameOpcode;
    }

    if (isa<UndefValue>(V2))
      return LookAheadHeuristics::ScoreUndef;
    int ShallowScoreAtThisLevel =
        getShallowScore(LHS, RHS, U1, U2, MainAltOps);

    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    // Contains the I2 operand indexes that got matched with I1 operands.
    SmallSet<unsigned, 4> Op2Used;

    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      // Try to pair op1I with the best operand of I2.
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
      // If I2 is commutative try all combinations.
      unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
      unsigned ToIdx = isCommutative(I2)
                           ? I2->getNumOperands()
                           : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        // Skip operands already paired with OpIdx1.
        if (Op2Used.count(OpIdx2))
          continue;
        // Recursively calculate the cost at each level.
        int TmpScore =
            getScoreAtLevelOf(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                              I1, I2, CurrLevel + 1, std::nullopt);
        // Look for the best score.
        if (TmpScore > LookAheadHeuristics::ScoreFail &&
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
          MaxOpIdx2 = OpIdx2;
          FoundBest = true;
        }
      }
      if (FoundBest) {
        // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
      }
    }
    return ShallowScoreAtThisLevel;
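
  // Illustration: when scoring the two adds in {a[0] + b[0], a[1] + b[1]},
  // level 1 of the recursion pairs a[0] with a[1] and b[0] with b[1]; both
  // are consecutive loads, so the adds receive a high aggregate score and
  // their operands keep their order. For {a[0] + b[0], b[1] + a[1]} the best
  // pairing is found on the swapped operands instead, which is what drives
  // operand reordering.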
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    /// The operand value.
    Value *V = nullptr;
    /// Alternate-predicate-opcode flag for the linearized form.
    bool APO = false;
    /// Helper data for the reordering function.
    bool IsUsed = false;
  };
  enum class ReorderingMode {
    Load, Opcode, Constant, Splat, Failed,
  };
  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  }

  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
  }

  /// Clears the used flag for all entries.
  void clearUsed() {
    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
         OpIdx != NumOperands; ++OpIdx)
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
           ++Lane)
        OpsVec[OpIdx][Lane].IsUsed = false;
  }

  /// Swap the operand at \p OpIdx1 with the operand at \p OpIdx2.
  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  }
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
      return 0;
    SmallPtrSet<Value *, 4> Uniques;
    for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
      if (Ln == Lane)
        continue;
      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))
        return 0;
      Uniques.insert(OpIdxLnV);
    }
    int UniquesCount = Uniques.size();
    int UniquesCntWithIdxLaneV =
        Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    int UniquesCntWithOpIdxLaneV =
        Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
      return 0;
    return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
            UniquesCntWithOpIdxLaneV) -
           (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  }
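
  // Intuition: PowerOf2Ceil(N) - N is the number of wasted lanes when N
  // unique values are placed in a power-of-two-wide vector (e.g. 5 uniques
  // waste 8 - 5 = 3 lanes, 4 uniques waste none), so the returned difference
  // favors the candidate whose unique-value count packs into lanes more
  // tightly.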
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;

    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
      return 0;
    return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
               ? LookAheadHeuristics::ScoreAllUserVectorized
               : 0;
  }

  /// Score scaling factor for fully compatible instructions but with
  /// different number of external uses.
  static const int ScoreScaleFactor = 10;

  int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                        int Lane, unsigned OpIdx, unsigned Idx,
                        bool &IsUsed) {

    int SplatScore = getSplatScore(Lane, OpIdx, Idx);
    if (Score <= -SplatScore) {
      Score = 0;
    } else {
      Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);
    }
  std::optional<unsigned> getBestOperand(unsigned OpIdx, int Lane,
                                         int LastLane,
                                         ArrayRef<ReorderingMode> ReorderingModes,
                                         ArrayRef<Value *> MainAltOps) {
    unsigned NumOperands = getNumOperands();

    // The operand of the previous lane at OpIdx.
    Value *OpLastLane = getData(OpIdx, LastLane).V;

    // Our strategy mode for OpIdx.
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;

    // The linearized opcode of the operand at OpIdx, Lane.
    bool OpIdxAPO = getData(OpIdx, Lane).APO;

    // The best operand index and its score.
    struct BestOpData {
      std::optional<unsigned> Idx;
      unsigned Score = 0;
    } BestOp;
    BestOp.Score =
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
            .first->second;

    // Track if the operand must be marked as used.
    bool IsUsed =
        RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
    // Iterate through all unused operands and look for the best.
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      // Get the operand at Idx and Lane.
      OperandData &OpData = getData(Idx, Lane);
      Value *Op = OpData.V;
      bool OpAPO = OpData.APO;

      // Skip already selected operands.
      if (OpData.IsUsed)
        continue;

      // Skip if we are trying to move the operand to a position with a
      // different opcode in the linearized tree form. This would break the
      // semantics.
      if (OpAPO != OpIdxAPO)
        continue;

      // Look for an operand that matches the current mode.
      switch (RMode) {
      case ReorderingMode::Load:
      case ReorderingMode::Constant:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed);
        if (Score > static_cast<int>(BestOp.Score)) {
          BestOp.Idx = Idx;
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        }
        break;
      }
      case ReorderingMode::Splat:
        if (Op == OpLastLane)
          BestOp.Idx = Idx;
        break;
      case ReorderingMode::Failed:
        llvm_unreachable("Not expected Failed reordering mode.");
      }
    }

    if (BestOp.Idx) {
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return BestOp.Idx;
    }
    // If we could not find a good match, return std::nullopt.
    return std::nullopt;
  }
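
  // Usage sketch: reorder() (below) walks the lanes outward from the best
  // starting lane and calls getBestOperand() for every (operand index, lane)
  // pair; a std::nullopt result flips that operand's mode to
  // ReorderingMode::Failed and arms the second reordering pass.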
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;

    SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      // Compare the number of operands that can move and choose the one with
      // the least number.
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap.clear();
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        // Select the most optimal lane in terms of number of operands that
        // should be moved around.
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto It = HashMap.find(NumFreeOpsHash.Hash);
        if (It == HashMap.end())
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        else
          ++It->second.first;
      }
    }
    // Select the lane with the minimum counter.
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
    for (const auto &Data : HashMap) {
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
      }
    }
    return BestLane;
  }
  /// Data structure that helps to reorder operands.
  struct OperandsOrderData {
    /// The best number of operands with the same APOs, which can be
    /// reordered.
    unsigned NumOfAPOs = UINT_MAX;
    /// Number of operands with the same/alternate instruction opcode and
    /// parent.
    unsigned NumOpsWithSameOpcodeParent = 0;
    /// Hash for the actual operands ordering.
    unsigned Hash = 0;
  };

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();

    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    Instruction *OpcodeI = nullptr;
    BasicBlock *Parent = nullptr;
    unsigned Hash = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (OpData.APO)
        ++CntTrue;
      // Use Boyer-Moore majority voting for finding the majority opcode and
      // the number of operands with this opcode.
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
        if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
            OpcodeI = I;
            Parent = I->getParent();
          } else {
            --NumOpsWithSameOpcodeParent;
          }
        } else {
          ++NumOpsWithSameOpcodeParent;
        }
      }
      Hash = hash_combine(
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    }
    if (AllUndefs)
      return {};
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
    Data.Hash = Hash;
    return Data;
  }
  /// Go through the instructions in VL and append their operands.
  void appendOperandsOfVL(ArrayRef<Value *> VL) {
    assert(!VL.empty() && "Bad VL");
    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(isa<Instruction>(VL[0]) && "Expected instruction");
    unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
        // The APO flag records whether the operand sits under an "inverse"
        // (non-commutative) operation in the linearized form.
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                               APO, false};
      }
    }
  }
  unsigned getNumOperands() const { return OpsVec.size(); }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  }

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      if (Ln == Lane)
        continue;
      // This is set to true if we found a candidate for broadcast at Lane.
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
          continue;
        if (Data.V == Op) {
          FoundCandidate = true;
          Data.IsUsed = true;
          break;
        }
      }
      if (!FoundCandidate)
        return false;
    }
    return true;
  }
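
  // Example: if some operand column is {%x, %x, %x, %x}, shouldBroadcast()
  // finds %x available in every other lane, so the operand can be vectorized
  // as a single splat (broadcast) rather than gathered element by element.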
  VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
             const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
      : TLI(TLI), DL(DL), SE(SE), R(R) {
    // Append all the operands of RootVL.
    appendOperandsOfVL(RootVL);
  }

  /// \returns the operand list for operand index \p OpIdx.
  ValueList getVL(unsigned OpIdx) const {
    ValueList OpVL(OpsVec[OpIdx].size());
    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    return OpVL;
  }
  void reorder() {
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();

    unsigned FirstLane = getBestLaneToStartReordering();

    // Initialize the modes.
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);

      if (isa<LoadInst>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else if (isa<Instruction>(OpLane0)) {
        // Check if OpLane0 should be broadcast.
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      else if (isa<Argument>(OpLane0))
        // Our best hope is a Splat. It may save some cost in some cases.
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
      else
        ReorderingModes[OpIdx] = ReorderingMode::Failed;
    }

    // No need to reorder if the operands are a perfect (possibly shuffled)
    // diamond match.
    auto &&SkipReordering = [this]() {
      SmallPtrSet<Value *, 4> UniqueValues;
      ArrayRef<OperandData> Op0 = OpsVec.front();
      for (const OperandData &Data : Op0)
        UniqueValues.insert(Data.V);
      for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
              return !UniqueValues.contains(Data.V);
            }))
          return false;
      }
      return true;
    };

    if (SkipReordering())
      return;

    bool StrategyFailed = false;
    // Mark all operand data as free to use.
    clearUsed();
    // Visit the lanes in a circular fashion, using FirstLane as the center
    // point and increasing the radius distance.
    SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);

    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      // Visit the lane on the right and then the lane on the left.
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
          continue;
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
               "Out of bounds");
        // Look for a good match for each operand.
        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
          std::optional<unsigned> BestIdx = getBestOperand(
              OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
          if (BestIdx) {
            // Swap the current operand with the one returned by
            // getBestOperand().
            swap(OpIdx, *BestIdx, Lane);
          } else {
            // We failed to find a best operand, set mode to 'Failed'.
            ReorderingModes[OpIdx] = ReorderingMode::Failed;
            // Enable the second pass.
            StrategyFailed = true;
          }
          // Try to get the alternate opcode and follow it during analysis.
          if (MainAltOps[OpIdx].size() != 2) {
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
                getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
            if (OpS.getOpcode() && OpS.isAltShuffle())
              MainAltOps[OpIdx].push_back(AltOp.V);
          }
        }
      }
    }
    // Skip the second pass if the strategy did not fail.
    if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Debug print.
    static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
      }
      return OS;
    }
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelOf(Candidates[I].first,
                                              Candidates[I].second,
                                              /*U1=*/nullptr, /*U2=*/nullptr,
                                              /*Level=*/1, std::nullopt);
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }

    DeletedInstructions.insert(I);

    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
  bool canReorderOperands(
      TreeEntry *UserTE,
      SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
      ArrayRef<TreeEntry *> ReorderableGathers,
      SmallVectorImpl<TreeEntry *> &GatherOps);

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Returns the vectorized operand \p OpIdx of the node \p UserTE, if any.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      TE = getTreeEntry(V);
      return TE;
    });
    if (It != VL.end() && TE->isSame(VL))
      return TE;
    return nullptr;
  }

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }
                    const EdgeInfo &EI);

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  Value *createBuildVector(const TreeEntry *E);

                const APInt &ShuffledIndices,
                bool NeedToShuffle) const;

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  static void reorderInputsAccordingToOpcode(

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == UndefMaskElem) ||
                                   (Idx != UndefMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }
    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    /// \returns true if the current entry has the same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }
    /// \returns the vectorization factor of the node.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    Value *VectorizedValue = nullptr;

    enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
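    // Rough meaning of the states: Vectorize = emit one wide instruction for
    // the bundle; ScatterVectorize = vectorize loads via a masked gather
    // intrinsic; NeedToGather = build the vector from scalars with
    // insertelement instructions (the fallback the cost model charges as a
    // gather).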
    VecTreeTy &Container;

    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].assign(OpVL.begin(), OpVL.end());
    }

    /// Set the operands of this bundle in their original order.
    void setOperandsInOrder() {
      assert(Operands.empty() && "Already initialized?");
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        Operands[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
        }
      }
    }
    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      MainOp = S.MainOp;
      AltOp = S.AltOp;
    }

    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    }

    unsigned getAltOpcode() const {
      return AltOp ? AltOp->getOpcode() : 0;
    }

    /// When ReuseShuffleIndices is empty it just returns the position of \p V
    /// within the vector of Scalars. Otherwise, it tries to remap via the
    /// reuse index.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
      }
      return FoundLane;
    }
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs() << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs() << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        dbgs() << "Vectorize\n";
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";

    dbgs() << "SLP: Calculated costs for Tree:\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
    TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                            std::optional<ScheduleData *> Bundle,
                            const InstructionsState &S,
                            const EdgeInfo &UserTreeIdx,
                            ArrayRef<int> ReuseShuffleIndices = std::nullopt,
                            ArrayRef<unsigned> ReorderIndices = std::nullopt) {
      TreeEntry::EntryState EntryState =
          Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
      return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                          ReuseShuffleIndices, ReorderIndices);
    }

    TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                            TreeEntry::EntryState EntryState,
                            std::optional<ScheduleData *> Bundle,
                            const InstructionsState &S,
                            const EdgeInfo &UserTreeIdx,
                            ArrayRef<int> ReuseShuffleIndices = std::nullopt,
                            ArrayRef<unsigned> ReorderIndices = std::nullopt) {
      assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
              (Bundle && EntryState != TreeEntry::NeedToGather)) &&
             "Need to vectorize gather entry?");
      VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
      TreeEntry *Last = VectorizableTree.back().get();
      Last->Idx = VectorizableTree.size() - 1;
      Last->State = EntryState;
      Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                       ReuseShuffleIndices.end());
      if (ReorderIndices.empty()) {
        Last->Scalars.assign(VL.begin(), VL.end());
        Last->setOperations(S);
      } else {
        // Reorder scalars and build the final mask.
        Last->Scalars.assign(VL.size(), nullptr);
        transform(ReorderIndices, Last->Scalars.begin(),
                  [VL](unsigned Idx) -> Value * {
                    if (Idx >= VL.size())
                      return UndefValue::get(VL.front()->getType());
                    return VL[Idx];
                  });
        Last->setOperations(S);
        Last->ReorderIndices.append(ReorderIndices.begin(),
                                    ReorderIndices.end());
      }
      if (Last->State != TreeEntry::NeedToGather) {
        for (Value *V : VL) {
          assert(!getTreeEntry(V) && "Scalar already in tree!");
          ScalarToTreeEntry[V] = Last;
        }
        // Update the scheduler bundle to point to this TreeEntry.
        ScheduleData *BundleMember = *Bundle;
        assert((BundleMember || isa<PHINode>(S.MainOp) ||
                isVectorLikeInstWithConstOps(S.MainOp) ||
                doesNotNeedToSchedule(VL)) &&
               "Bundle and VL out of sync");
        if (BundleMember) {
          for (Value *V : VL) {
            if (doesNotNeedToBeScheduled(V))
              continue;
            assert(BundleMember && "Unexpected end of bundle.");
            BundleMember->TE = Last;
            BundleMember = BundleMember->NextInBundle;
          }
        }
        assert(!BundleMember && "Bundle and VL out of sync");
      } else {
        MustGather.insert(VL.begin(), VL.end());
      }

      if (UserTreeIdx.UserTE)
        Last->UserTreeIndices.push_back(UserTreeIdx);

      return Last;
    }
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
    }

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  }
  struct ExternalUser {

    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    std::optional<bool> &result = AliasCache[key];

    bool aliased = true;

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      OpValue = OpVal;
    }

    /// Verify basic self-consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// Opcode of the current instruction in the schedule data.
    Value *OpValue = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies. This value is calculated on demand.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. As soon as this is zero, the
    /// instruction/bundle gets ready for scheduling.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if it can't possibly be in the map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V, Value *Key) {
      if (V == Key)
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))
          return SD;
      }
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks a bundle as scheduled. Decrements the unscheduled dependency
    /// counts of all dependent bundles and inserts them into the ready list,
    /// if necessary.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)
          continue;

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if
        // ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              // There are no more unscheduled dependencies after
              // decrementing, so we can put the dependent instruction into
              // the ready list.
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
              LLVM_DEBUG(dbgs()
                         << "SLP:    gets ready (def): " << *DepBundle
                         << "\n");
            }
          });
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be
          // reordered.
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          auto *In = BundleMember->Inst;
          assert(In &&
                 (isa<ExtractValueInst, ExtractElementInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          (void)In; // fake use to silence unused-variable warnings

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:     gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:     gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
      }
    void doForAllOpcodes(Value *V,
                         function_ref<void(ScheduleData *SD)> Action) {
      if (ScheduleData *SD = getScheduleData(V))
        Action(SD);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))
            Action(P.second);
    }

    /// Put all instructions into the ReadyList which are ready for
    /// scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
              SD->isReady()) {
            ReadyList.insert(SD);
            LLVM_DEBUG(dbgs()
                       << "SLP:    initially in ready list: " << *SD << "\n");
          }
        });
      }
    }
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    void resetSchedule();

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

        ExtraScheduleDataMap;

    /// The first memory-accessing instruction in the scheduling region.
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory-accessing instruction in the scheduling region.
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The ID of the scheduling region. For a new vectorization iteration
    /// this is incremented, which "removes" all ScheduleData from the region.
    int SchedulingRegionID = 1;
  };

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  class nodes_iterator {

    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
    OS << Entry->Idx << ".\n";

    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }

    if (Entry->State == TreeEntry::NeedToGather)
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize)
      return "color=blue";
  for (auto *I : DeletedInstructions) {
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

#ifdef EXPENSIVE_CHECKS
/// Reorders the given \p Reuses mask according to the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != UndefMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
/// Reorders the given \p Order according to the given \p Mask. If the
/// resulting order is the identity, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order,
                         ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Mask.size());
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
    Order.clear();
    return;
  }
  Order.assign(Mask.size(), Mask.size());
  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
    if (MaskOrder[I] != UndefMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  unsigned NumScalars = TE.Scalars.size();
  OrdersType CurrentOrder(NumScalars, NumScalars);
  SmallBitVector UsedPositions(NumScalars);
  const TreeEntry *STE = nullptr;
  // Try to find all gathered scalars that get vectorized in another node.
  // There may only be a single vectorized node to correctly identify the
  // order of the gathered scalars.
  for (unsigned I = 0; I < NumScalars; ++I) {
    Value *V = TE.Scalars[I];
    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
      continue;
    if (const auto *LocalSTE = getTreeEntry(V)) {
      if (!STE)
        STE = LocalSTE;
      else if (STE != LocalSTE)
        // Take the order only from a single vector node.
        return std::nullopt;
      unsigned Lane =
          std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
      if (Lane >= NumScalars)
        return std::nullopt;
      if (CurrentOrder[Lane] != NumScalars) {
        if (Lane != I)
          continue;
        UsedPositions.reset(CurrentOrder[Lane]);
      }
      // The partial identity (where only some elements of the gather node
      // are in the identity order) is good.
      CurrentOrder[Lane] = I;
      UsedPositions.set(I);
    }
  }
  // Need to keep the order if we have a vector entry and at least 2 scalars
  // or the vectorized entry has just 2 scalars.
  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
    auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
      for (unsigned I = 0; I < NumScalars; ++I)
        if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
          return false;
      return true;
    };
    if (IsIdentityOrder(CurrentOrder)) {
      CurrentOrder.clear();
      return CurrentOrder;
    }
    auto *It = CurrentOrder.begin();
    for (unsigned I = 0; I < NumScalars;) {
      if (UsedPositions.test(I)) {
        ++It;
        ++I;
        continue;
      }
      if (*It == NumScalars) {
        *It = I;
        ++I;
      }
      ++It;
    }
    return CurrentOrder;
  }
  return std::nullopt;
}
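
// Example (roughly): if the gather node's scalars {a, b, c, d} all already
// live in a single vectorized entry as {b, a, d, c}, the returned order lets
// the gather be emitted as one shuffle of that entry's vector instead of
// four extractelement instructions.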
enum class LoadsState { Gather, Vectorize, ScatterVectorize };

static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  if (!GEP1)
    return false;
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  if (!GEP2)
    return false;
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
         ((isConstant(GEP1->getOperand(1)) &&
           isConstant(GEP2->getOperand(1))) ||
          !CompareOpcodes ||
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
              .getOpcode());
}

  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8 bits.
  if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
    if (!L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<unsigned>(*Diff) == VL.size() - 1)
      return LoadsState::Vectorize;
    bool ProfitableGatherPointers =
        })) <= VL.size() / 2 && VL.size() > 2;
    if (ProfitableGatherPointers ||
        all_of(PointerOps, [IsSorted](Value *P) {
          auto *GEP = dyn_cast<GetElementPtrInst>(P);
          return (IsSorted && !GEP) ||
                 (GEP && GEP->getNumOperands() == 2);
        })) {
      Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
      for (Value *V : VL)
        CommonAlignment =
            std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
        return LoadsState::ScatterVectorize;
    }

  return LoadsState::Gather;
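
// Decision summary: consecutive sorted pointers (*Diff == VL.size() - 1)
// give LoadsState::Vectorize (a single wide load); compatible but
// non-consecutive pointers for which a masked gather is legal give
// LoadsState::ScatterVectorize; everything else falls back to
// LoadsState::Gather.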
  assert(llvm::all_of(
             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
         "Expected list of pointer operands.");
  // Insert each pointer into a vector of (Ptr, Offset, OrigIdx) keyed by
  // base, then sort and return the sorted indices with values next to one
  // another.
  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));

  unsigned Cnt = 1;
  for (Value *Ptr : VL.drop_front()) {
    bool Found = any_of(Bases, [&](auto &Base) {
      std::optional<int> Diff =
          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
                          /*StrictCheck=*/true);
      if (!Diff)
        return false;

      Base.second.emplace_back(Ptr, *Diff, Cnt++);
      return true;
    });

    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return;

      // Not found already - add a new base.
      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
    }
  }

  // For each of the bases, sort the pointers by offset and check whether any
  // become consecutively allocated.
  bool AnyConsecutive = false;
  for (auto &Base : Bases) {
    auto &Vec = Base.second;
    if (Vec.size() > 1) {
      llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
                                const std::tuple<Value *, int, unsigned> &Y) {
        return std::get<1>(X) < std::get<1>(Y);
      });
      int InitialOffset = std::get<1>(Vec[0]);
      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
      });
    }
  }

  // Fill SortedIndices only if it looks worthwhile to sort the pointers.
  SortedIndices.clear();
  if (!AnyConsecutive)
    return;

  for (auto &Base : Bases) {
    for (auto &T : Base.second)
      SortedIndices.push_back(std::get<2>(T));
  }

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
  }

  return std::nullopt;
    if (Idx1 == std::nullopt || Idx2 == std::nullopt)
      return false;

    IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));

    IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));

  } while (IE1 || IE2);
  // No need to reorder if we need to shuffle reuses; we still need to
  // shuffle the node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // Check if reuse shuffle indices can be improved by reordering. For
    // this, check that the reuse mask is "clustered", i.e. each scalar value
    // is used once in each submask of size <number of scalars>.
    unsigned Sz = TE.Scalars.size();
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz))
      return std::nullopt;
    unsigned VF = TE.getVectorFactor();
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
        all_of(TE.Scalars, [Sz](Value *V) {
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      SmallVector<int> ReorderMask(Sz, UndefMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == UndefMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask(Sz, UndefMaskElem);
      std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (all_of(enumerate(ResOrder),
               [](const auto &Data) { return Data.index() == Data.value(); }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::Vectorize &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
      !TE.isAltShuffle())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
      if (!V1->hasOneUse() || !V2->hasOneUse())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          if (IE1->getOperand(0) != IE2->getOperand(0))
            return false;
          std::optional<unsigned> Idx1 = getInsertIndex(IE1);
          std::optional<unsigned> Idx2 = getInsertIndex(IE2);
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
            return false;
          return *Idx1 < *Idx2;
        }
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
            return false;
          std::optional<unsigned> Idx1 = getExtractIndex(EE1);
          std::optional<unsigned> Idx2 = getExtractIndex(EE2);
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
            return false;
          return *Idx1 < *Idx2;
        }
      return false;
    };
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
        if (Idx != Order[Idx])
          return false;
      return true;
    };
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    DenseMap<Value *, unsigned> PhiToId;
    SmallVector<Value *> Phis;
    OrdersType ResOrder(TE.Scalars.size());
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) {
      PhiToId[TE.Scalars[Id]] = Id;
      Phis.push_back(TE.Scalars[Id]);
    }
    llvm::stable_sort(Phis, PHICompare);
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::NeedToGather) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.getOpcode() == Instruction::ExtractElement &&
          !TE.isAltShuffle()) ||
         (all_of(TE.Scalars,
                 [](Value *V) {
                   return isa<UndefValue, ExtractElementInst>(V);
                 }) &&
          any_of(TE.Scalars,
                 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that the gather of extractelements can be represented as just
      // a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
      if (Reuse || !CurrentOrder.empty()) {
        if (!CurrentOrder.empty())
          fixupOrderingIndices(CurrentOrder);
        return std::move(CurrentOrder);
      }
    }
    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
    if (TE.Scalars.size() >= 4)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
  }
  return std::nullopt;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder the reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses, still keep the order of the
  // clusters to avoid extra cost for extra shuffles.
  if (TE.State != TreeEntry::NeedToGather ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
      ExternalUserReorderMap;

  for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
                              &GathersToOrders, &ExternalUserReorderMap,
                              &AltShufflesToOrders, &PhisToOrders](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction
    // on x86; reordering them into [fsub,fadd] blocks this pattern, so take
    // their order into account.
    if (TE->isAltShuffle()) {
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // The opcode mask selects between the two opcodes.
      SmallBitVector OpcodeMask(TE->Scalars.size(), false);
      for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
        if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
          OpcodeMask.set(Lane);

      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
    }

    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the
      // bottom-to-top stage.
      unsigned Cnt = 0;
      const TreeEntry *UserTE = TE.get();
      while (UserTE && Cnt < RecursionMaxDepth) {
        if (UserTE->UserTreeIndices.size() != 1)
          break;
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
        ++Cnt;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });
  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
       VF /= 2) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We just are looking for the
    // most used order and reorder the scalar elements in the nodes according
    // to this most used order.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this node; still need to extend and use a
      // shuffle, just merge the reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count the number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->State == TreeEntry::NeedToGather ||
            !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the vector factor differs from the number of scalars, use the
        // natural order - it is an "ambiguous" case.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
      }
      // Stores actually store the mask, not the order; need to invert.
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    // Set the order of the user node.
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    OrdersType BestOrder = OrdersUses.front().first;
    unsigned Cnt = OrdersUses.front().second;
    for (const auto &Pair : drop_begin(OrdersUses)) {
      if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
    }
    if (BestOrder.empty())
      continue;
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : UndefMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Need to reorder the reuses masks of the operands with smaller VF
          // to be able to correctly support their users.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          // Update the ordering of the operands with smaller VF.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if (TE->State == TreeEntry::Vectorize &&
          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
              InsertElementInst>(TE->getMainOp()) &&
          !TE->isAltShuffle()) {
        // Build the correct orders for extract{element,value}, loads and
        // stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra reorder-index shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 OpData.second->State == TreeEntry::Vectorize;
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if the operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required. Similar to the gathers, so
      // simply add to the list of gathered ops. If there are reused scalars,
      // process this node as a regular vectorize node, just reorder the
      // reuses mask.
      if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
4402 for_each(VectorizableTree, [
this, &OrderedEntries, &GathersToOrders,
4404 const std::unique_ptr<TreeEntry> &TE) {
4405 if (TE->State != TreeEntry::Vectorize)
4407 if (std::optional<OrdersType> CurrentOrder =
4409 OrderedEntries.
insert(TE.get());
4410 if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
4411 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
  // Propagate the orders to the graph nodes that use only reordered nodes.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    // 2. If the entry has multiple uses - skip it and jump to the next node.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            (TE->State == TreeEntry::NeedToGather &&
             GathersToOrders.count(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speed up
      // the search; the graph does not provide this dependency directly.
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // Erase filtered entries, so they will not be reordered.
    for_each(Filtered,
             [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
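    // Process the user nodes from the latest in the graph to the earliest
    // (higher Idx first), so that an order accepted by a user can keep
    // sinking toward the root on later iterations of the outer loop.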
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // All operands are reordered and used only in this node - propagate
      // the most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
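        // Pick the effective order of this operand: gathers and nodes with
        // reused scalars keep theirs in GathersToOrders, other vectorized
        // nodes carry it in ReorderIndices. Each operand then votes for its
        // order once per edge it occupies in the user (NumOps below).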
        const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
          if (OpTE->State == TreeEntry::NeedToGather ||
              !OpTE->ReuseShuffleIndices.empty())
            return GathersToOrders.find(OpTE)->second;
          return OpTE->ReorderIndices;
        }();
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                            const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->State == TreeEntry::NeedToGather) {
            auto It = GathersToOrders.find(TE);
            if (It != GathersToOrders.end())
              return !It->second.empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering itself, has
          // reused scalars, is an alternate-opcode vectorize node or is the
          // root entry while root reordering is ignored.
          if (AllowsReordering(UserTE))
            continue;
          // Count a vote for the natural (empty) order if at most half of
          // the operands of this user allow reordering.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      // If no orders - skip current nodes and jump to the next one, if any.
      if (OrdersUses.empty()) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // Choose the most used order.
      ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
      unsigned Cnt = OrdersUses.front().second;
      for (const auto &Pair : drop_begin(OrdersUses)) {
        if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
          BestOrder = Pair.first;
          Cnt = Pair.second;
        }
      }
      // Set order of the user node (reordering of operands and user nodes).
      if (BestOrder.empty()) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : UndefMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize)
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder the reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle()) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
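
/// Walks all vectorized tree entries and records in ExternalUses every
/// scalar that is used outside of the tree (or registered as an extra
/// argument), together with the lane it has to be extracted from.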
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      int FoundLane = Entry->findLaneForValue(Scalar);

      // Check if the scalar is externally used as an extra arg.
      auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in lane 0 will be
          // used.
          if (UseScalar != U ||
              UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
      }
    }
  }
}
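
/// Collects the stores that use scalars of \p TE, bucketed by the underlying
/// pointer object. Each bucket keeps at most one simple store per lane, all
/// from the same basic block and storing values of the same type.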
DenseMap<Value *, SmallVector<StoreInst *, 4>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // To save compilation time we don't visit if we have too many users.
    static constexpr unsigned UsersLimit = 4;
    if (V->hasNUsesOrMore(UsersLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entries already in the tree.
      if (getTreeEntry(U))
        continue;

      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      auto &StoresVec = PtrToStoresMap[Ptr];
      // For now just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      // Skip if in different BBs.
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
        continue;
      // Make sure that the stores are of the same type.
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
        continue;
      StoresVec.push_back(SI);
    }
  }
  return PtrToStoresMap;
}
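
/// Checks whether the stores in \p StoresVec can form a vector instruction:
/// sorted by their offset from the first store, the offsets must be
/// consecutive; \p ReorderIndices then receives each store's rank in that
/// sorted order. For example, offsets {0,+2,+1,+3} relative to the first
/// store sort to {0,1,2,3}, so the stores are consecutive and
/// ReorderIndices = {0,2,1,3}; an identity order is returned empty.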
bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
                            OrdersType &ReorderIndices) const {
  // To avoid calling getPointersDiff() while sorting, build a vector of
  // pairs {store, offset from the first store} and sort that instead.
  SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec[0] = {S0, 0};
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0->getValueOperand()->getType(),
                        S0->getPointerOperand(),
                        SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    // We failed to compare the pointers so just abandon this StoresVec.
    if (!Diff)
      return false;
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  }

  // Sort the vector based on the pointer offsets.
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  });

  // Check that the sorted stores are consecutive.
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
      return false;

  // Calculate the shuffle indices according to their offset against the
  // sorted StoreOffsetVec.
  ReorderIndices.reserve(StoresVec.size());
  for (StoreInst *SI : StoresVec) {
    unsigned Idx = find_if(StoreOffsetVec,
                           [SI](const std::pair<StoreInst *, int> &Pair) {
                             return Pair.first == SI;
                           }) -
                   StoreOffsetVec.begin();
    ReorderIndices.push_back(Idx);
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so follow the same
  // convention here.
  auto IsIdentityOrder = [](const OrdersType &Order) {
    for (unsigned Idx : seq<unsigned>(0, Order.size()))
      if (Idx != Order[Idx])
        return false;
    return true;
  };
  if (IsIdentityOrder(ReorderIndices))
    ReorderIndices.clear();

  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
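
/// For every pointer object whose collected stores cover all lanes of \p TE,
/// returns the reorder indices that would allow those external stores to be
/// vectorized as a unit.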
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
      collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
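
// The two buildTree() entry points reset any previous state and start the
// recursive construction from the roots; the first overload additionally
// records the ignore-list that is consulted when collecting external uses.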
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  buildTree_rec(Roots, 0, EdgeInfo());
}
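
/// \returns true if the list \p VL contains exactly one value that actually
/// requires scheduling (all other values do not need to be scheduled).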
static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return false;
  }
  return NeedsScheduling;
}
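
/// Builds a (key, subkey) pair of hashes used to cluster values that may be
/// vectorized together: the key groups values by coarse kind (loads,
/// vector-like constants, opcode families), while the subkey refines the
/// grouping, e.g. by load-pointer distance, normalized compare predicate or
/// callee. \p AllowAlternate lets alternating binary ops/casts share a key.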
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes, except for alternatable
    // operations and compares.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      if (AllowAlternate)
        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
      else
        Key = hash_combine(hash_value(I->isUnaryOp()), Key);
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      // Normalize the predicate so that swappable compares share a subkey.
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getSwappedPredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      // Group calls by vector intrinsic ID, or by callee when there is one.
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
      if (isTriviallyVectorizable(ID))
        SubKey = hash_combine(hash_value(Instruction::Call), hash_value(ID));
      else if (Function *Callee = Call->getCalledFunction())
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Callee));
      else
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (I->isIntDivRem() && !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(I));
    }
  }
  return std::make_pair(Key, SubKey);
}
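
// buildTree_rec() below gathers a bundle instead of vectorizing it whenever
// repeated scalars cannot be modeled by a reuse shuffle. For example,
// VL = {a, b, a, c} deduplicates to UniqueValues = {a, b, c} with
// ReuseShuffleIndicies = {0, 1, 0, 2}; since 3 unique scalars is not a power
// of 2, the whole bundle is gathered instead.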
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndicies;
  SmallVector<Value *> UniqueValues;
  // Check that every instruction appears once in this bundle.
  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
                                &UserTreeIdx,
                                this](const InstructionsState &S) {
    DenseMap<Value *, unsigned> UniquePositions;
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndicies.emplace_back(
            isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndicies.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    if (NumUniqueScalarValues == VL.size()) {
      ReuseShuffleIndicies.clear();
    } else {
      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
      if (NumUniqueScalarValues <= 1 ||
          (UniquePositions.size() == 1 &&
           all_of(UniqueValues,
                  [](Value *V) {
                    return isa<UndefValue>(V) || !isConstant(V);
                  })) ||
          !isPowerOf2_32(NumUniqueScalarValues)) {
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext
  // of a load), in which case peek through to include it in the tree, without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
        (match(S.MainOp, m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() ==
                      cast<Instruction>(S.MainOp)->getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
    return;
  }

  // Don't handle scalable vectors.
  if (S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
    return;
  }

  // Don't handle vectors.
  if (S.OpValue->getType()->isVectorTy() &&
      !isa<InsertElementInst>(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
      return;
    }