#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

    cl::desc("Run the SLP vectorization passes"));
    cl::desc("Only vectorize if you gain more than this "
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));
    cl::desc("Attempt to vectorize horizontal reductions"));
        "Attempt to vectorize horizontal reductions feeding into a store"));
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
    cl::desc("Limit the size of the SLP scheduling region per block"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Limit the recursion depth when building a vectorizable tree"));
    cl::desc("Only vectorize small trees if they are fully vectorizable"));
    cl::desc("The maximum look-ahead depth for operand reordering scores"));
    cl::desc("The maximum look-ahead depth for searching best rooting option"));
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));
    cl::desc("The maximum stride, considered to be profitable."));
    cl::desc("Display the SLP trees with Graphviz"));
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&

  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);

  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";

  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;

  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
                ICmpInst::Predicate Pred;
                if (match(U.getUser(),
                          m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                    (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                return match(U.getUser(),
                             m_Intrinsic<Intrinsic::abs>(
                                 m_Specific(U.get()), m_ConstantInt(Flag))) &&
                       (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
                return match(U.getUser(),
                             m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
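// getInsertIndex flattens the position of an insertelement/insertvalue into a
// single linear index: at each aggregate level the running index is multiplied
// by the number of elements of that level before the level's own index is
// added. E.g. inserting at indices {2, 1} into [4 x {i32, i32}] yields
// 2 * 2 + 1 = 5.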
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();

  if (MaskArg == UseMask::UndefsAsMask)
  if (MaskArg == UseMask::FirstArg && Value < VF)
    UseMask.reset(Value);
  else if (MaskArg == UseMask::SecondArg && Value >= VF)
    UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      Base = II->getOperand(0);
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))

static std::optional<TargetTransformInfo::ShuffleKind>
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)

  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
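/// InstructionsState describes a bundle of scalars by a main and an alternate
/// opcode: when MainOp == AltOp the bundle is homogeneous, otherwise it is
/// vectorized as an "alternate" node whose lanes mix the two opcodes and are
/// blended with a shuffle.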
struct InstructionsState {
  Value *OpValue = nullptr;

  unsigned getAltOpcode() const {

  bool isAltShuffle() const { return AltOp != MainOp; }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  InstructionsState() = delete;
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}

  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))

                                       unsigned BaseIndex = 0);

         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
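// getSameOpcode scans the bundle and decides whether all scalars share one
// opcode or form a legal main/alt opcode pair (e.g. add/sub, compatible casts,
// or compares with swapped predicates); any other mismatch returns a state
// with a null MainOp, which callers treat as "gather".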
                                       unsigned BaseIndex) {
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  bool SwappedPredsCompatible = [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;

  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));

  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  return SI->isSimple();
  return !MI->isVolatile();
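// addMask composes the current Mask with SubMask: each SubMask element indexes
// into the existing Mask. E.g. Mask = {0, 2, 1, 3} combined with
// SubMask = {1, 3} yields the mask {2, 3}.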
                    bool ExtendingManyInputs = false) {
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
          (SubMask.size() == Mask.size() &&
           std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                       [](int Idx) { return Idx == PoisonMaskElem; }))) &&
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];

  auto *I = dyn_cast<Instruction>(V);
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&

namespace slpvectorizer {

  struct ScheduleData;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),

    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    ExternalUsesAsGEPs.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();

    return MaxVecRegSize;
    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;

                         bool TryRecursiveCheck = true) const;

    OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
       << " EdgeIdx:" << EdgeIdx << "}";

      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}

    if (isa<LoadInst>(V1)) {
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))

    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
      if (std::abs(*Dist) > NumLanes / 2)

    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);
      if (isa<UndefValue>(V2))
      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)

    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
      if (I1->getParent() != I2->getParent())
      if (S.getOpcode() &&
          (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
            return cast<Instruction>(V)->getNumOperands() ==
                   S.MainOp->getNumOperands();
    if (isa<UndefValue>(V2))

    int ShallowScoreAtThisLevel =
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, std::nullopt);
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
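  // VLOperands stores the operands of a bundle as an operand-by-lane matrix
  // and reorders them so that each operand column vectorizes well: a best
  // starting lane is chosen, then for every other lane the operand with the
  // highest look-ahead score relative to the previous lane is picked.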
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;

  enum class ReorderingMode {

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];

  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
         OpIdx != NumOperands; ++OpIdx)
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);

  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
    for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))
      Uniques.insert(OpIdxLnV);
    int UniquesCount = Uniques.size();
    int UniquesCntWithIdxLaneV =
        Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    int UniquesCntWithOpIdxLaneV =
        Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
                UniquesCntWithOpIdxLaneV) -
           (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);

  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);

  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
    std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
        RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Constant:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed);
        if (Score > static_cast<int>(BestOp.Score)) {
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Splat:
        if (Op == OpLastLane)
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;

  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto *It = HashMap.find(NumFreeOpsHash.Hash);
        if (It == HashMap.end())
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;

    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(isa<Instruction>(VL[0]) && "Expected instruction");
    unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
    constexpr unsigned IntrinsicNumOperands = 2;
    if (isa<IntrinsicInst>(VL[0]))
      NumOperands = IntrinsicNumOperands;
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),

  unsigned getNumOperands() const { return OpsVec.size(); }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }

  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        FoundCandidate = true;
      if (!FoundCandidate)

      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
    appendOperandsOfVL(RootVL);

    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);
      if (isa<LoadInst>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else if (isa<Instruction>(OpLane0)) {
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      else if (isa<Argument>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Failed;

    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {

    if (SkipReordering())
    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= (int)NumLanes)
      assert(LastLane >= 0 && LastLane < (int)NumLanes &&
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        std::optional<unsigned> BestIdx = getBestOperand(
            OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
          swap(OpIdx, *BestIdx, Lane);
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
          StrategyFailed = true;
        if (MainAltOps[OpIdx].size() != 2) {
          OperandData &AltOp = getData(OpIdx, Lane);
          InstructionsState OpS =
          if (OpS.getOpcode() && OpS.isAltShuffle())
    if (!StrategyFailed)

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
      if (Score > BestScore) {

    DeletedInstructions.insert(I);

    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

  bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
                             unsigned &MaxDepthLevel,
                             bool &IsProfitableToDemote,
                             bool IsTruncRoot) const;

  canReorderOperands(TreeEntry *UserTE,

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);

  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  getCastContextHint(const TreeEntry &TE) const;
                       const EdgeInfo &EI);
                     bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Args &...Params);
  Value *createBuildVector(const TreeEntry *E);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
                                  unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
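  /// TreeEntry is a node of the vectorizable tree: it owns the bundle of
  /// scalars, the node state (Vectorize, ScatterVectorize, StridedVectorize or
  /// NeedToGather), the per-operand sub-bundles, the reuse/reorder masks and
  /// the edges to its user entries.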
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

          [Scalars](Value *V, int Idx) {
            return (isa<UndefValue>(V) && Idx == PoisonMaskElem) ||
                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

    VecTreeTy &Container;

      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");

    void setOperandsInOrder() {
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");

    bool isAltShuffle() const { return MainOp != AltOp; }

      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);

      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))

    void setOperations(const InstructionsState &S) {

    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;

    unsigned getAltOpcode() const {

    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));

    bool isNonPowOf2Vec() const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
        dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        dbgs() << "NeedToGather\n";
      dbgs() << "MainOp: ";
        dbgs() << *MainOp << "\n";
      dbgs() << "AltOp: ";
        dbgs() << *AltOp << "\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";

  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";

                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (Last->State != TreeEntry::NeedToGather) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
               "Scalar already in tree!");
          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
        ScalarToTreeEntry[V] = Last;
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
             "Bundle and VL out of sync");
      for (Value *V : VL) {
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      assert(!BundleMember && "Bundle and VL out of sync");
      bool AllConstsOrCasts = true;
        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    if (UserTreeIdx.UserTE) {
      Last->UserTreeIndices.push_back(UserTreeIdx);
      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
             "Reordering isn't implemented for non-power-of-2 nodes yet");

  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);

  bool areAltOperandsProfitable(const InstructionsState &S,
  TreeEntry::EntryState getScalarsVectorizationState(

  using ValueToGatherNodesMap =
  ValueToGatherNodesMap ValueToGatherNodes;

  struct ExternalUser {

    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
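  /// ScheduleData models one instruction (or one member of a vectorization
  /// bundle) inside the scheduling region of a block: it links bundle members,
  /// tracks data, memory and control dependencies, and becomes "ready" once
  /// all of its dependencies have been scheduled.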
  struct ScheduleData {
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");
        assert(isSchedulingEntity() && "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();

    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;

      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;

    Value *OpValue = nullptr;
    TreeEntry *TE = nullptr;
    ScheduleData *FirstInBundle = nullptr;
    ScheduleData *NextInBundle = nullptr;
    ScheduleData *NextLoadStore = nullptr;
    int SchedulingRegionID = 0;
    int SchedulingPriority = 0;
    int Dependencies = InvalidDeps;
    int UnscheduledDeps = InvalidDeps;
    bool IsScheduled = false;

                                       const BoUpSLP::ScheduleData &SD) {
  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;
      ++SchedulingRegionID;

      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);

    ScheduleData *getScheduleData(Value *V, Value *Key) {
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;

    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
                         << "SLP: gets ready (def): " << *DepBundle << "\n");
        if (TreeEntry *TE = BundleMember->TE) {
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");
          auto *In = BundleMember->Inst;
                  (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                   In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");

    void doForAllOpcodes(Value *V,
      if (ScheduleData *SD = getScheduleData(V))
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))

    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            ReadyList.insert(SD);
                       << "SLP: initially in ready list: " << *SD << "\n");

    std::optional<ScheduleData *>
                      const InstructionsState &S);
    ScheduleData *allocateScheduleDataChunks();
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);
                             ScheduleData *PrevLoadStore,
                             ScheduleData *NextLoadStore);
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
    void resetSchedule();

        ExtraScheduleDataMap;
    ScheduleData *FirstLoadStoreInRegion = nullptr;
    ScheduleData *LastLoadStoreInRegion = nullptr;
    bool RegionHasStackSave = false;
    int ScheduleRegionSize = 0;
    int SchedulingRegionID = 1;

  void scheduleBlock(BlockScheduling *BS);

  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;
  unsigned ReductionBitWidth = 0;
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  struct ChildIteratorType
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    return R.VectorizableTree[0].get();
    return {N->UserTreeIndices.begin(), N->Container};
    return {N->UserTreeIndices.end(), N->Container};

  class nodes_iterator {
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
    return nodes_iterator(R->VectorizableTree.begin());
    return nodes_iterator(R->VectorizableTree.end());
  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }

    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
    if (Entry->State == TreeEntry::NeedToGather)
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";

  for (auto *I : DeletedInstructions) {
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
    I->dropAllReferences();
  for (auto *I : DeletedInstructions) {
           "trying to erase instruction with users.");
    I->eraseFromParent();

#ifdef EXPENSIVE_CHECKS

  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Reuses[Mask[I]] = Prev[I];

                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (Order.empty()) {
    std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    PrevOrder.swap(Order);
  for (unsigned I = 0; I < Sz; ++I)
      Order[I] = PrevOrder[Mask[I]];
      return Data.value() == Sz || Data.index() == Data.value();
  if (Order.empty()) {
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  for (unsigned I = 0; I < Sz; ++I)
      Order[MaskOrder[I]] = I;
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
    return std::nullopt;
  if (NumParts == 0 || NumParts >= NumScalars)
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      Entries.front().front()->isSame(TE.Scalars)) {
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
    return all_of(Mask, [&](int I) {
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
      const int VF = GetVF(I);
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(0, PartSz)) {
        int Idx = Mask[I * PartSz + K];
          Value *V = GatheredScalars[I * PartSz + K];
            SecondVecFound = true;
            SecondVecFound = true;
      FirstMin = (FirstMin / PartSz) * PartSz;
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
      for (int K : seq<int>(0, PartSz)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx >= PartSz) {
          SecondVecFound = true;
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
  int PartSz = NumScalars / NumParts;
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
          for (unsigned Idx : seq<unsigned>(0, PartSz)) {
            int K = I * PartSz + Idx;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  .getKnownMinValue());
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
                                 bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
             getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)

template <typename T>
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;

  unsigned Sz = Order.size();
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
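// The helper below checks whether a set of pointers can be written as
// Base + I * Stride for a common (possibly runtime) SCEV stride; when it can,
// the corresponding loads may be emitted as a single strided load instead of a
// gather.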
static std::optional<Value *>
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
      return std::nullopt;
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
      PtrSCEVLowest = PtrSCEV;
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
      PtrSCEVHighest = PtrSCEV;
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
    if (Multiplier == Dist)
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    Stride = TryGetStride(Dist, Sz);
      return std::nullopt;
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  using DistOrdPair = std::pair<int64_t, int>;
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Coeff = TryGetStride(Diff, Stride);
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
      return std::nullopt;
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
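// The load-vectorization check below classifies a bundle of loads: consecutive
// pointers become a plain vector load, a constant or runtime stride becomes a
// strided load, pointers that are loop-invariant or simple GEPs may still be
// profitable as a masked gather, and anything else has to be gathered.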
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
  const unsigned Sz = VL.size();
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
    *POIter = L->getPointerOperand();
         "supported with VectorizeNonPowerOf2");
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    std::optional<int> Diff =
    if (static_cast<unsigned>(*Diff) == Sz - 1)
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
        (static_cast<unsigned>(std::abs(*Diff)) <=
             static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
         *Diff == -(static_cast<int>(Sz) - 1))) {
      int Stride = *Diff / static_cast<int>(Sz - 1);
      if (*Diff == Stride * static_cast<int>(Sz - 1)) {
          else if (Ptr != Ptr0)
          if (((Dist / Stride) * Stride) != Dist ||
              !Dists.insert(Dist).second)
        if (Dists.size() == Sz)
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
    MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
    for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
      unsigned VectorizedCnt = 0;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
           Cnt += VF, ++VectorizedCnt) {
      if (VectorizedCnt == VL.size() / VF) {
            Instruction::Load, VecTy,
          auto *LI0 = cast<LoadInst>(VL[I * VF]);
                Instruction::Load, SubVecTy, LI0->getAlign(),
                LI0->getPointerAddressSpace(), CostKind,
                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
                "Expected only consecutive, strided or masked gather loads.");
          for (int Idx : seq<int>(0, VL.size()))
                                  ShuffleMask, CostKind, I * VF, SubVecTy);
        if (MaskedGatherCost > VecLdCost)
  bool ProfitableGatherPointers =
        return L->isLoopInvariant(V);
  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
    Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
    if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4473 "Expected list of pointer operands.");
4478 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4483 std::optional<int> Diff =
4489 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4495 if (Bases.
size() > VL.
size() / 2 - 1)
4499 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4505 bool AnyConsecutive =
false;
4506 for (
auto &
Base : Bases) {
4507 auto &Vec =
Base.second;
4508 if (Vec.size() > 1) {
4510 const std::tuple<Value *, int, unsigned> &
Y) {
4511 return std::get<1>(
X) < std::get<1>(
Y);
4513 int InitialOffset = std::get<1>(Vec[0]);
4515 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4521 SortedIndices.
clear();
4522 if (!AnyConsecutive)
4525 for (
auto &
Base : Bases) {
4526 for (
auto &
T :
Base.second)
4531 "Expected SortedIndices to be the size of VL");
4535std::optional<BoUpSLP::OrdersType>
4537 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4538 Type *ScalarTy = TE.Scalars[0]->getType();
4541 Ptrs.
reserve(TE.Scalars.size());
4542 for (
Value *V : TE.Scalars) {
4543 auto *L = dyn_cast<LoadInst>(V);
4544 if (!L || !L->isSimple())
4545 return std::nullopt;
4551 return std::move(Order);
4552 return std::nullopt;
4563 if (VU->
getType() != V->getType())
4566 if (!VU->
hasOneUse() && !V->hasOneUse())
4572 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4578 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
4579 bool IsReusedIdx =
false;
4581 if (IE2 == VU && !IE1)
4583 if (IE1 == V && !IE2)
4584 return V->hasOneUse();
4585 if (IE1 && IE1 != V) {
4587 IsReusedIdx |= ReusedIdx.
test(Idx1);
4588 ReusedIdx.
set(Idx1);
4589 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
4592 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4594 if (IE2 && IE2 != VU) {
4596 IsReusedIdx |= ReusedIdx.
test(Idx2);
4597 ReusedIdx.
set(Idx2);
4598 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4601 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4603 }
while (!IsReusedIdx && (IE1 || IE2));
4607std::optional<BoUpSLP::OrdersType>
4610 if (TE.isNonPowOf2Vec())
4611 return std::nullopt;
4615 if (!TE.ReuseShuffleIndices.empty()) {
4617 return std::nullopt;
4625 unsigned Sz = TE.Scalars.size();
4626 if (TE.State == TreeEntry::NeedToGather) {
4627 if (std::optional<OrdersType> CurrentOrder =
4632 ::addMask(Mask, TE.ReuseShuffleIndices);
4633 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4634 unsigned Sz = TE.Scalars.size();
4635 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
4638 Res[
Idx + K * Sz] =
I + K * Sz;
4640 return std::move(Res);
4643 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4645 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4646 return std::nullopt;
4650 if (TE.ReorderIndices.empty())
4651 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4654 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4655 unsigned VF = ReorderMask.
size();
4657 unsigned NumParts = VF / Sz;
4659 for (
unsigned I = 0;
I < VF;
I += Sz) {
4661 unsigned UndefCnt = 0;
4670 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
4672 return std::nullopt;
4674 for (
unsigned K = 0; K < NumParts; ++K)
4675 ResOrder[Val + Sz * K] =
I + K;
4677 return std::move(ResOrder);
4679 unsigned VF = TE.getVectorFactor();
4682 TE.ReuseShuffleIndices.end());
4683 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4685 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4686 return Idx && *Idx < Sz;
4689 if (TE.ReorderIndices.empty())
4690 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4693 for (
unsigned I = 0;
I < VF; ++
I) {
4694 int &
Idx = ReusedMask[
I];
4697 Value *V = TE.Scalars[ReorderMask[
Idx]];
4699 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
4705 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
4706 auto *It = ResOrder.
begin();
4707 for (
unsigned K = 0; K < VF; K += Sz) {
4711 std::iota(SubMask.begin(), SubMask.end(), 0);
4713 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
4714 std::advance(It, Sz);
4716 if (TE.State == TreeEntry::NeedToGather &&
4718 [](
const auto &
Data) {
return Data.index() ==
Data.value(); }))
4719 return std::nullopt;
4720 return std::move(ResOrder);
4722 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4723 any_of(TE.UserTreeIndices,
4725 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4727 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
4728 return std::nullopt;
4729 if ((TE.State == TreeEntry::Vectorize ||
4730 TE.State == TreeEntry::StridedVectorize) &&
4731 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4732 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4734 return TE.ReorderIndices;
4735 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4736 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
4737 Value *V1 = TE.Scalars[I1];
4738 Value *V2 = TE.Scalars[I2];
4739 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
4745 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
4746 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4747 if (
auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4748 if (
auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4755 if (
auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4756 if (
auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4757 if (EE1->getOperand(0) != EE2->getOperand(0))
4763 auto IsIdentityOrder = [](
const OrdersType &Order) {
4764 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
4769 if (!TE.ReorderIndices.empty())
4770 return TE.ReorderIndices;
4773 std::iota(Phis.begin(), Phis.end(), 0);
4775 for (
unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4778 for (
unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4779 ResOrder[Id] = PhiToId[Phis[Id]];
4780 if (IsIdentityOrder(ResOrder))
4781 return std::nullopt;
4782 return std::move(ResOrder);
4784 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4788 if ((TE.getOpcode() == Instruction::ExtractElement ||
4789 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4790 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4792 auto *EE = dyn_cast<ExtractElementInst>(V);
4793 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4798 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4800 if (Reuse || !CurrentOrder.
empty())
4801 return std::move(CurrentOrder);
4809 int Sz = TE.Scalars.size();
4811 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4813 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
4814 if (It == TE.Scalars.begin())
4817 if (It != TE.Scalars.end()) {
4819 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4834 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4837 return std::move(Order);
4842 return std::nullopt;
4843 if (TE.Scalars.size() >= 4)
4847 return CurrentOrder;
4849 return std::nullopt;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    // ...
    if (Cluster != FirstCluster)
      // ...
  }

void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // ...
  const unsigned Sz = TE.Scalars.size();
  // ...
  if (TE.State != TreeEntry::NeedToGather ||
      // ...
  addMask(NewMask, TE.ReuseShuffleIndices);
  // ...
  TE.ReorderIndices.clear();
  // ...
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
4897 "Expected same size of orders");
4898 unsigned Sz = Order.
size();
4900 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
4901 if (Order[
Idx] != Sz)
4902 UsedIndices.
set(Order[
Idx]);
4904 if (SecondaryOrder.
empty()) {
4905 for (
unsigned Idx : seq<unsigned>(0, Sz))
4906 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
4909 for (
unsigned Idx : seq<unsigned>(0, Sz))
4910 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
4911 !UsedIndices.
test(SecondaryOrder[
Idx]))
4912 Order[
Idx] = SecondaryOrder[
Idx];
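  // -------------------------------------------------------------------------
  // Illustrative sketch (not part of this pass): the loop above completes a
  // partial permutation.  Slots still equal to Sz are "unset"; they are
  // filled from a secondary order when that target is not taken yet, or with
  // their own position when no secondary order is available.  The name below
  // (completeOrder) is hypothetical; standard library only, guarded out of
  // the build.
#if 0
#include <vector>

inline void completeOrder(std::vector<unsigned> &Order,
                          const std::vector<unsigned> &SecondaryOrder) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  for (unsigned Idx = 0; Idx < Sz; ++Idx)
    if (Order[Idx] != Sz)
      Used[Order[Idx]] = true;
  if (SecondaryOrder.empty()) {
    // Fill an unset slot with its own position if that position is free.
    for (unsigned Idx = 0; Idx < Sz; ++Idx)
      if (Order[Idx] == Sz && !Used[Idx]) {
        Order[Idx] = Idx;
        Used[Idx] = true;
      }
  } else {
    for (unsigned Idx = 0; Idx < Sz; ++Idx)
      if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
          !Used[SecondaryOrder[Idx]]) {
        Order[Idx] = SecondaryOrder[Idx];
        Used[SecondaryOrder[Idx]] = true;
      }
  }
}
#endif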
4932 ExternalUserReorderMap;
4937 const std::unique_ptr<TreeEntry> &TE) {
4940 findExternalStoreUsersReorderIndices(TE.get());
4941 if (!ExternalUserReorderIndices.
empty()) {
4942 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4944 std::move(ExternalUserReorderIndices));
4950 if (TE->isAltShuffle()) {
4953 unsigned Opcode0 = TE->getOpcode();
4954 unsigned Opcode1 = TE->getAltOpcode();
4957 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4958 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4959 OpcodeMask.
set(Lane);
4961 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4962 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4968 if (std::optional<OrdersType> CurrentOrder =
4978 const TreeEntry *UserTE = TE.get();
4980 if (UserTE->UserTreeIndices.size() != 1)
4983 return EI.UserTE->State == TreeEntry::Vectorize &&
4984 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4987 UserTE = UserTE->UserTreeIndices.back().UserTE;
4990 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4991 if (!(TE->State == TreeEntry::Vectorize ||
4992 TE->State == TreeEntry::StridedVectorize) ||
4993 !TE->ReuseShuffleIndices.empty())
4994 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
4995 if (TE->State == TreeEntry::Vectorize &&
4996 TE->getOpcode() == Instruction::PHI)
4997 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
5002 for (
unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5004 auto It = VFToOrderedEntries.
find(VF);
5005 if (It == VFToOrderedEntries.
end())
5017 for (
const TreeEntry *OpTE : OrderedEntries) {
5020 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5023 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5025 if (OpTE->State == TreeEntry::NeedToGather ||
5026 !OpTE->ReuseShuffleIndices.empty()) {
5027 auto It = GathersToOrders.find(OpTE);
5028 if (It != GathersToOrders.end())
5031 if (OpTE->isAltShuffle()) {
5032 auto It = AltShufflesToOrders.find(OpTE);
5033 if (It != AltShufflesToOrders.end())
5036 if (OpTE->State == TreeEntry::Vectorize &&
5037 OpTE->getOpcode() == Instruction::PHI) {
5038 auto It = PhisToOrders.
find(OpTE);
5039 if (It != PhisToOrders.
end())
5042 return OpTE->ReorderIndices;
5045 auto It = ExternalUserReorderMap.
find(OpTE);
5046 if (It != ExternalUserReorderMap.
end()) {
5047 const auto &ExternalUserReorderIndices = It->second;
5051 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5052 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
5053 ExternalUserReorderIndices.size();
5055 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
5056 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5063 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5064 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5067 unsigned E = Order.size();
5070 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5073 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5075 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5078 if (OrdersUses.empty())
5081 const unsigned Sz = Order.size();
5082 for (
unsigned Idx : seq<unsigned>(0, Sz))
5083 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5088 unsigned IdentityCnt = 0;
5089 unsigned FilledIdentityCnt = 0;
5091 for (
auto &Pair : OrdersUses) {
5092 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5093 if (!Pair.first.empty())
5094 FilledIdentityCnt += Pair.second;
5095 IdentityCnt += Pair.second;
5100 unsigned Cnt = IdentityCnt;
5101 for (
auto &Pair : OrdersUses) {
5105 if (Cnt < Pair.second ||
5106 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5107 Cnt == Pair.second && !BestOrder.
empty() &&
5108 IsIdentityOrder(BestOrder))) {
5110 BestOrder = Pair.first;
5117 if (IsIdentityOrder(BestOrder))
5123 unsigned E = BestOrder.
size();
5125 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5128 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5130 if (TE->Scalars.size() != VF) {
5131 if (TE->ReuseShuffleIndices.size() == VF) {
5137 return EI.UserTE->Scalars.size() == VF ||
5138 EI.UserTE->Scalars.size() ==
5141 "All users must be of VF size.");
5144 reorderNodeWithReuses(*TE, Mask);
5148 if ((TE->State == TreeEntry::Vectorize ||
5149 TE->State == TreeEntry::StridedVectorize) &&
5152 !TE->isAltShuffle()) {
5156 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5157 TE->reorderOperands(Mask);
5160 TE->reorderOperands(Mask);
5161 assert(TE->ReorderIndices.empty() &&
5162 "Expected empty reorder sequence.");
5165 if (!TE->ReuseShuffleIndices.empty()) {
5172 addMask(NewReuses, TE->ReuseShuffleIndices);
5173 TE->ReuseShuffleIndices.swap(NewReuses);
5179bool BoUpSLP::canReorderOperands(
5180 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5184 if (UserTE->isNonPowOf2Vec())
5187 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
5188 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
5189 return OpData.first ==
I &&
5190 (OpData.second->State == TreeEntry::Vectorize ||
5191 OpData.second->State == TreeEntry::StridedVectorize);
5194 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
5196 if (
any_of(TE->UserTreeIndices,
5197 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5201 Edges.emplace_back(
I, TE);
5207 if (TE->State != TreeEntry::Vectorize &&
5208 TE->State != TreeEntry::StridedVectorize &&
5209 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5213 TreeEntry *
Gather =
nullptr;
5215 [&
Gather, UserTE,
I](TreeEntry *TE) {
5216 assert(TE->State != TreeEntry::Vectorize &&
5217 TE->State != TreeEntry::StridedVectorize &&
5218 "Only non-vectorized nodes are expected.");
5219 if (
any_of(TE->UserTreeIndices,
5220 [UserTE,
I](
const EdgeInfo &EI) {
5221 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5223 assert(TE->isSame(UserTE->getOperand(
I)) &&
5224 "Operand entry does not match operands.");
5245 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5246 if (TE->State != TreeEntry::Vectorize &&
5247 TE->State != TreeEntry::StridedVectorize)
5249 if (std::optional<OrdersType> CurrentOrder =
5251 OrderedEntries.
insert(TE.get());
5252 if (!(TE->State == TreeEntry::Vectorize ||
5253 TE->State == TreeEntry::StridedVectorize) ||
5254 !TE->ReuseShuffleIndices.empty())
5255 GathersToOrders.
insert(TE.get());
5264 while (!OrderedEntries.
empty()) {
5269 for (TreeEntry *TE : OrderedEntries) {
5270 if (!(TE->State == TreeEntry::Vectorize ||
5271 TE->State == TreeEntry::StridedVectorize ||
5272 (TE->State == TreeEntry::NeedToGather &&
5274 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5277 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5279 !Visited.
insert(TE).second) {
5285 for (
EdgeInfo &EI : TE->UserTreeIndices) {
5286 TreeEntry *UserTE = EI.
UserTE;
5287 auto It =
Users.find(UserTE);
5288 if (It ==
Users.end())
5289 It =
Users.insert({UserTE, {}}).first;
5290 It->second.emplace_back(EI.
EdgeIdx, TE);
5294 for (TreeEntry *TE : Filtered)
5295 OrderedEntries.remove(TE);
5297 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5299 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
5300 return Data1.first->Idx > Data2.first->Idx;
5302 for (
auto &
Data : UsersVec) {
5305 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
5307 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5308 OrderedEntries.remove(
Op.second);
5321 for (
const auto &
Op :
Data.second) {
5322 TreeEntry *OpTE =
Op.second;
5323 if (!VisitedOps.
insert(OpTE).second)
5325 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5327 const auto Order = [&]() ->
const OrdersType {
5328 if (OpTE->State == TreeEntry::NeedToGather ||
5329 !OpTE->ReuseShuffleIndices.empty())
5332 return OpTE->ReorderIndices;
5336 if (Order.size() == 1)
5339 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
5340 return P.second == OpTE;
5343 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5344 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5347 unsigned E = Order.size();
5350 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5353 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5356 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5358 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
5359 const auto AllowsReordering = [&](
const TreeEntry *TE) {
5361 if (TE->isNonPowOf2Vec())
5363 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5364 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5365 (IgnoreReorder && TE->Idx == 0))
5367 if (TE->State == TreeEntry::NeedToGather) {
5376 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
5377 TreeEntry *UserTE = EI.
UserTE;
5378 if (!VisitedUsers.
insert(UserTE).second)
5383 if (AllowsReordering(UserTE))
5391 if (
static_cast<unsigned>(
count_if(
5392 Ops, [UserTE, &AllowsReordering](
5393 const std::pair<unsigned, TreeEntry *> &
Op) {
5394 return AllowsReordering(
Op.second) &&
5397 return EI.UserTE == UserTE;
5399 })) <= Ops.
size() / 2)
5400 ++Res.first->second;
5403 if (OrdersUses.empty()) {
5404 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5405 OrderedEntries.remove(
Op.second);
5409 const unsigned Sz = Order.size();
5410 for (
unsigned Idx : seq<unsigned>(0, Sz))
5411 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5416 unsigned IdentityCnt = 0;
5417 unsigned VF =
Data.second.front().second->getVectorFactor();
5419 for (
auto &Pair : OrdersUses) {
5420 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5421 IdentityCnt += Pair.second;
5426 unsigned Cnt = IdentityCnt;
5427 for (
auto &Pair : OrdersUses) {
5431 if (Cnt < Pair.second) {
5433 BestOrder = Pair.first;
5440 if (IsIdentityOrder(BestOrder)) {
5441 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5442 OrderedEntries.remove(
Op.second);
5451 unsigned E = BestOrder.
size();
5453 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5455 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
5456 TreeEntry *TE =
Op.second;
5457 OrderedEntries.remove(TE);
5458 if (!VisitedOps.
insert(TE).second)
5460 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
5461 reorderNodeWithReuses(*TE, Mask);
5465 if (TE->State != TreeEntry::Vectorize &&
5466 TE->State != TreeEntry::StridedVectorize &&
5467 (TE->State != TreeEntry::ScatterVectorize ||
5468 TE->ReorderIndices.empty()))
5470 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
5471 TE->ReorderIndices.empty()) &&
5472 "Non-matching sizes of user/operand entries.");
5474 if (IgnoreReorder && TE == VectorizableTree.front().get())
5475 IgnoreReorder =
false;
5478 for (TreeEntry *
Gather : GatherOps) {
5480 "Unexpected reordering of gathers.");
5481 if (!
Gather->ReuseShuffleIndices.empty()) {
5487 OrderedEntries.remove(
Gather);
5491 if (
Data.first->State != TreeEntry::Vectorize ||
5492 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5493 Data.first->getMainOp()) ||
5494 Data.first->isAltShuffle())
5495 Data.first->reorderOperands(Mask);
5496 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
5497 Data.first->isAltShuffle() ||
5498 Data.first->State == TreeEntry::StridedVectorize) {
5502 if (
Data.first->ReuseShuffleIndices.empty() &&
5503 !
Data.first->ReorderIndices.empty() &&
5504 !
Data.first->isAltShuffle()) {
5507 OrderedEntries.insert(
Data.first);
5515 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5516 VectorizableTree.front()->ReuseShuffleIndices.empty())
5517 VectorizableTree.front()->ReorderIndices.clear();
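  // -------------------------------------------------------------------------
  // Illustrative sketch (not part of this pass): both reordering walks above
  // collect, per node, the orders requested by its users, count how often
  // each distinct order occurs, and apply the most frequent one, preferring
  // the identity order on a tie so that no extra shuffle is emitted.  The
  // helper below (pickBestOrder, a hypothetical name) models this counting
  // with the standard library only; it is guarded out of the build.
#if 0
#include <map>
#include <numeric>
#include <vector>

using OrderTy = std::vector<unsigned>;

inline OrderTy pickBestOrder(const std::vector<OrderTy> &RequestedOrders,
                             unsigned VF) {
  std::map<OrderTy, unsigned> OrdersUses;
  for (const OrderTy &O : RequestedOrders)
    ++OrdersUses[O];
  OrderTy Identity(VF);
  std::iota(Identity.begin(), Identity.end(), 0);
  // The identity order wins all ties: keeping the original lane order is
  // free, any other order costs a shuffle.
  unsigned BestCount = OrdersUses[Identity];
  OrderTy Best = Identity;
  for (const auto &Pair : OrdersUses)
    if (Pair.second > BestCount) {
      BestCount = Pair.second;
      Best = Pair.first;
    }
  return Best;
}
#endif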
5524 for (
auto &TEPtr : VectorizableTree) {
5525 TreeEntry *Entry = TEPtr.get();
5528 if (Entry->State == TreeEntry::NeedToGather)
5532 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5533 Value *Scalar = Entry->Scalars[Lane];
5534 if (!isa<Instruction>(Scalar))
5537 auto It = ScalarToExtUses.
find(Scalar);
5538 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
5542 const auto *ExtI = ExternallyUsedValues.
find(Scalar);
5543 if (ExtI != ExternallyUsedValues.
end()) {
5544 int FoundLane = Entry->findLaneForValue(Scalar);
5545 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
5546 << FoundLane <<
" from " << *Scalar <<
".\n");
5547 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
5548 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
5551 for (
User *U : Scalar->users()) {
5559 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5563 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5567 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5569 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5570 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
5572 assert(UseEntry->State != TreeEntry::NeedToGather &&
"Bad state");
5576 if (It != ScalarToExtUses.
end()) {
5577 ExternalUses[It->second].User =
nullptr;
5582 int FoundLane = Entry->findLaneForValue(Scalar);
5584 <<
" from lane " << FoundLane <<
" from " << *Scalar
5586 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
5587 ExternalUses.emplace_back(Scalar, U, FoundLane);
5596BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
5598 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5599 Value *V = TE->Scalars[Lane];
5605 for (
User *U : V->users()) {
5606 auto *SI = dyn_cast<StoreInst>(U);
5607 if (SI ==
nullptr || !SI->isSimple() ||
5611 if (getTreeEntry(U))
5615 auto &StoresVec = PtrToStoresMap[
Ptr];
5618 if (StoresVec.size() > Lane)
5621 if (!StoresVec.empty() &&
5622 SI->getParent() != StoresVec.back()->getParent())
5625 if (!StoresVec.empty() &&
5626 SI->getValueOperand()->getType() !=
5627 StoresVec.back()->getValueOperand()->getType())
5629 StoresVec.push_back(SI);
5632 return PtrToStoresMap;
                              OrdersType &ReorderIndices) const {
  // ...
  StoreOffsetVec[0] = {S0, 0};
  // ...
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    // ...
    std::optional<int> Diff =
        // ...
        SI->getPointerOperand(), *DL, *SE,
        // ...
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  }

  // Sort the stores by ascending offset from the first store.
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  });

  // The sorted offsets must form a consecutive run.
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
      // ...

  // Record, for each store in its original lane, its position in the sorted
  // (memory) order.
  ReorderIndices.reserve(StoresVec.size());
  // ...
      [SI](const std::pair<StoreInst *, int> &Pair) {
        return Pair.first == SI;
      // ...
      StoreOffsetVec.begin();
    ReorderIndices.push_back(Idx);
  // ...
  auto IsIdentityOrder = [](const OrdersType &Order) {
    for (unsigned Idx : seq<unsigned>(0, Order.size()))
      // ...
  };
  if (IsIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
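  // -------------------------------------------------------------------------
  // Illustrative sketch (not part of this pass): the routine above sorts the
  // candidate stores by offset, requires the sorted offsets to be
  // consecutive, and records for every store where it lands in memory order;
  // an identity result is cleared because no reordering is needed.  The
  // helper below (storeReorderIndices, a hypothetical name) models this on
  // plain integer offsets; standard library only, guarded out of the build.
#if 0
#include <algorithm>
#include <utility>
#include <vector>

// Offsets[I] is the element offset of store I from the first store.  Returns
// the memory-order position of every store, empty when the offsets are not
// consecutive or when the order is already the identity.
inline std::vector<unsigned>
storeReorderIndices(const std::vector<int> &Offsets) {
  std::vector<std::pair<int, unsigned>> OffsetAndIdx;
  for (unsigned I = 0; I < Offsets.size(); ++I)
    OffsetAndIdx.push_back({Offsets[I], I});
  std::stable_sort(OffsetAndIdx.begin(), OffsetAndIdx.end());
  for (unsigned I = 1; I < OffsetAndIdx.size(); ++I)
    if (OffsetAndIdx[I].first != OffsetAndIdx[I - 1].first + 1)
      return {}; // Not consecutive: no vector store can be formed.

  std::vector<unsigned> ReorderIndices(Offsets.size());
  for (unsigned Pos = 0; Pos < OffsetAndIdx.size(); ++Pos)
    ReorderIndices[OffsetAndIdx[Pos].second] = Pos;

  bool Identity = true;
  for (unsigned I = 0; I < ReorderIndices.size(); ++I)
    Identity &= ReorderIndices[I] == I;
  if (Identity)
    ReorderIndices.clear(); // Already in memory order: nothing to reorder.
  return ReorderIndices;
}
#endif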
  for (unsigned Idx : Order)
    // ...

BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  // ...
      collectUserStores(TE);
  // ...
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // Only groups with exactly one store per lane can feed a vector store.
    if (StoresVec.size() != NumLanes)
      // ...
    if (!canFormVector(StoresVec, ReorderIndices))
      // ...
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
  UserIgnoreList = &UserIgnoreLst;
  // ...
  buildTree_rec(Roots, 0, EdgeInfo());
  // ...
  buildTree_rec(Roots, 0, EdgeInfo());
  // ...
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    // ...
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      // ...
    }
    // ...
  }
  return NeedsScheduling;
5780 bool AllowAlternate) {
5784 if (
auto *LI = dyn_cast<LoadInst>(V)) {
5787 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
5792 if (isa<ExtractElementInst, UndefValue>(V))
5794 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
5796 !isa<UndefValue>(EI->getIndexOperand()))
5799 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
5802 if ((isa<BinaryOperator, CastInst>(
I)) &&
5812 : cast<CastInst>(
I)->getOperand(0)->getType()));
5814 if (isa<CastInst>(
I)) {
5815 std::pair<size_t, size_t> OpVals =
5821 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
5823 if (CI->isCommutative())
5829 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
5843 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
5844 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5845 SubKey =
hash_value(Gep->getPointerOperand());
5849 !isa<ConstantInt>(
I->getOperand(1))) {
5857 return std::make_pair(Key, SubKey);
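  // -------------------------------------------------------------------------
  // Illustrative sketch (not part of this pass): the routine above produces a
  // (Key, SubKey) pair of hashes so that values which could belong to one
  // vectorizable bundle (same opcode, compatible operand shape) fall into the
  // same bucket.  A minimal model of that bucketing with std::hash follows;
  // every name (hashCombine, makeKeySubkey) is hypothetical and the mixing
  // constant is just a common 64-bit mixer.  Guarded out of the build.
#if 0
#include <cstddef>
#include <functional>
#include <string>
#include <utility>

inline std::size_t hashCombine(std::size_t Seed, std::size_t V) {
  return Seed ^ (V + 0x9e3779b97f4a7c15ULL + (Seed << 6) + (Seed >> 2));
}

// Opcode plays the role of the coarse key; an operand-derived tag refines it
// into the sub-key, mirroring the key/sub-key split used for grouping.
inline std::pair<std::size_t, std::size_t>
makeKeySubkey(unsigned Opcode, const std::string &OperandTag) {
  std::size_t Key = std::hash<unsigned>{}(Opcode);
  std::size_t SubKey = hashCombine(Key, std::hash<std::string>{}(OperandTag));
  return {Key, SubKey};
}
#endif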
5867bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
5869 unsigned Opcode0 = S.getOpcode();
5870 unsigned Opcode1 = S.getAltOpcode();
5873 for (
unsigned Lane : seq<unsigned>(0, VL.
size()))
5874 if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
5875 OpcodeMask.set(Lane);
5878 Opcode0, Opcode1, OpcodeMask))
5881 for (
unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5885 Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
5889 for (
unsigned I : seq<unsigned>(0, VL.size() - 1)) {
5895 switch (Res.value_or(0)) {
5910 constexpr unsigned NumAltInsts = 3;
5911 unsigned NonInstCnt = 0;
5914 unsigned UndefCnt = 0;
5916 unsigned ExtraShuffleInsts = 0;
5925 return is_contained(Operands.back(), V);
5928 ++ExtraShuffleInsts;
5945 if (isa<Constant, ExtractElementInst>(V) ||
5946 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
5947 if (isa<UndefValue>(V))
5953 if (!Res.second && Res.first->second == 1)
5954 ++ExtraShuffleInsts;
5955 ++Res.first->getSecond();
5956 if (
auto *
I = dyn_cast<Instruction>(V))
5957 UniqueOpcodes.
insert(
I->getOpcode());
5958 else if (Res.second)
5961 return none_of(Uniques, [&](
const auto &
P) {
5962 return P.first->hasNUsesOrMore(
P.second + 1) &&
5964 return getTreeEntry(U) || Uniques.contains(U);
5973 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
5974 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
5975 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
5978BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5981 assert(S.MainOp &&
"Expected instructions with same/alternate opcodes only.");
5983 unsigned ShuffleOrOp =
5984 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
5985 auto *VL0 = cast<Instruction>(S.OpValue);
5986 switch (ShuffleOrOp) {
5987 case Instruction::PHI: {
5990 for (
Value *
Incoming : cast<PHINode>(V)->incoming_values()) {
5992 if (Term &&
Term->isTerminator()) {
5994 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
5995 return TreeEntry::NeedToGather;
5999 return TreeEntry::Vectorize;
6001 case Instruction::ExtractValue:
6002 case Instruction::ExtractElement: {
6003 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6006 return TreeEntry::NeedToGather;
6007 if (Reuse || !CurrentOrder.empty())
6008 return TreeEntry::Vectorize;
6010 return TreeEntry::NeedToGather;
6012 case Instruction::InsertElement: {
6016 for (
Value *V : VL) {
6017 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
6019 "Non-constant or undef index?");
6023 return !SourceVectors.contains(V);
6026 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
6027 "different source vectors.\n");
6028 return TreeEntry::NeedToGather;
6031 return TreeEntry::Vectorize;
6033 case Instruction::Load: {
6042 return TreeEntry::Vectorize;
6044 return TreeEntry::ScatterVectorize;
6046 return TreeEntry::StridedVectorize;
6049 Type *ScalarTy = VL0->getType();
6050 if (
DL->getTypeSizeInBits(ScalarTy) !=
6051 DL->getTypeAllocSizeInBits(ScalarTy))
6052 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
6054 [](
Value *V) {
return !cast<LoadInst>(V)->isSimple(); }))
6059 return TreeEntry::NeedToGather;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      // ...
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // All compares must use the same (or the swapped) predicate and compare
    // operands of the same type.
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      // ...
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return TreeEntry::Vectorize;
6123 case Instruction::GetElementPtr: {
6125 for (
Value *V : VL) {
6126 auto *
I = dyn_cast<GetElementPtrInst>(V);
6129 if (
I->getNumOperands() != 2) {
6130 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
6131 return TreeEntry::NeedToGather;
6137 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6138 for (
Value *V : VL) {
6139 auto *
GEP = dyn_cast<GEPOperator>(V);
6142 Type *CurTy =
GEP->getSourceElementType();
6144 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
6145 return TreeEntry::NeedToGather;
6150 Type *Ty1 = VL0->getOperand(1)->getType();
6151 for (
Value *V : VL) {
6152 auto *
I = dyn_cast<GetElementPtrInst>(V);
6155 auto *
Op =
I->getOperand(1);
6156 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6157 (
Op->getType() != Ty1 &&
6158 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6159 Op->getType()->getScalarSizeInBits() >
6160 DL->getIndexSizeInBits(
6161 V->getType()->getPointerAddressSpace())))) {
6163 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
6164 return TreeEntry::NeedToGather;
6168 return TreeEntry::Vectorize;
  case Instruction::Store: {
    // Check whether the stores are consecutive or need to be swizzled.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // All stores in the bundle must be simple (not atomic, not volatile).
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        // ...
        return TreeEntry::NeedToGather;
      }
      // ...
    }
    // ...
    if (CurrentOrder.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[CurrentOrder.front()];
      PtrN = PointerOps[CurrentOrder.back()];
    }
    std::optional<int> Dist =
        // ...
    if (static_cast<unsigned>(*Dist) == VL.size() - 1)
      return TreeEntry::Vectorize;
    // ...
    return TreeEntry::NeedToGather;
  }
6212 case Instruction::Call: {
6215 CallInst *CI = cast<CallInst>(VL0);
6226 return TreeEntry::NeedToGather;
6231 for (
unsigned J = 0; J != NumArgs; ++J)
6234 for (
Value *V : VL) {
6235 CallInst *CI2 = dyn_cast<CallInst>(V);
6241 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
6243 return TreeEntry::NeedToGather;
6247 for (
unsigned J = 0; J != NumArgs; ++J) {
6250 if (ScalarArgs[J] != A1J) {
6252 <<
"SLP: mismatched arguments in call:" << *CI
6253 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
6254 return TreeEntry::NeedToGather;
6263 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
6264 <<
"!=" << *V <<
'\n');
6265 return TreeEntry::NeedToGather;
6269 return TreeEntry::Vectorize;
6271 case Instruction::ShuffleVector: {
6274 if (!S.isAltShuffle()) {
6275 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
6276 return TreeEntry::NeedToGather;
6281 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and "
6282 "the whole alt sequence is not profitable.\n");
6283 return TreeEntry::NeedToGather;
6286 return TreeEntry::Vectorize;
6290 return TreeEntry::NeedToGather;
6295 const EdgeInfo &UserTreeIdx) {
6301 auto TryToFindDuplicates = [&](
const InstructionsState &S,
6302 bool DoNotFail =
false) {
6305 for (
Value *V : VL) {
6312 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
6317 size_t NumUniqueScalarValues = UniqueValues.
size();
6318 if (NumUniqueScalarValues == VL.size()) {
6319 ReuseShuffleIndicies.
clear();
6322 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6323 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
6324 "for nodes with padding.\n");
6325 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6329 if (NumUniqueScalarValues <= 1 ||
6330 (UniquePositions.size() == 1 &&
all_of(UniqueValues,
6332 return isa<UndefValue>(V) ||
6335 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6336 if (DoNotFail && UniquePositions.size() > 1 &&
6337 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6339 return isa<ExtractElementInst>(V) ||
6340 areAllUsersVectorized(cast<Instruction>(V),
6344 if (PWSz == VL.size()) {
6345 ReuseShuffleIndicies.
clear();
6347 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
6348 NonUniqueValueVL.
append(PWSz - UniqueValues.
size(),
6349 UniqueValues.
back());
6350 VL = NonUniqueValueVL;
6355 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6366 if (!EphValues.
empty()) {
6367 for (
Value *V : VL) {
6368 if (EphValues.
count(V)) {
6370 <<
") is ephemeral.\n");
6371 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6381 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6386 cast<Instruction>(
I)->getOpcode() ==
6387 cast<Instruction>(S.MainOp)->getOpcode();
6389 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
6390 if (TryToFindDuplicates(S))
6391 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6392 ReuseShuffleIndicies);
6397 if (S.getOpcode() == Instruction::ExtractElement &&
6398 isa<ScalableVectorType>(
6399 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6400 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
6401 if (TryToFindDuplicates(S))
6402 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6403 ReuseShuffleIndicies);
6408 if (S.OpValue->getType()->isVectorTy() &&
6409 !isa<InsertElementInst>(S.OpValue)) {
6411 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6415 if (
StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6416 if (
SI->getValueOperand()->getType()->isVectorTy()) {
6417 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to store vector type.\n");
6418 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6427 auto &&NotProfitableForVectorization = [&S,
this,
6429 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6438 for (
Value *V : VL) {
6439 auto *
I = cast<Instruction>(V);
6441 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6445 if ((IsCommutative &&
6446 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6448 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
6450 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
6452 auto *
I1 = cast<Instruction>(VL.front());
6453 auto *I2 = cast<Instruction>(VL.back());
6454 for (
int Op = 0,
E = S.MainOp->getNumOperands();
Op <
E; ++
Op)
6456 I2->getOperand(
Op));
6457 if (
static_cast<unsigned>(
count_if(
6458 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6460 })) >= S.MainOp->getNumOperands() / 2)
6462 if (S.MainOp->getNumOperands() > 2)
6464 if (IsCommutative) {
6467 for (
int Op = 0,
E = S.MainOp->getNumOperands();
Op <
E; ++
Op)
6469 I2->getOperand((
Op + 1) %
E));
6471 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6480 bool IsScatterVectorizeUserTE =
6481 UserTreeIdx.UserTE &&
6482 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6483 bool AreAllSameInsts =
6485 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6489 auto *
I = dyn_cast<GetElementPtrInst>(V);
6493 BB =
I->getParent();
6494 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
6500 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6503 NotProfitableForVectorization(VL)) {
6504 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
6505 if (TryToFindDuplicates(S))
6506 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6507 ReuseShuffleIndicies);
6515 if (TreeEntry *
E = getTreeEntry(S.OpValue)) {
6516 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.OpValue <<
".\n");
6517 if (!
E->isSame(VL)) {
6518 auto It = MultiNodeScalars.
find(S.OpValue);
6519 if (It != MultiNodeScalars.
end()) {
6520 auto *TEIt =
find_if(It->getSecond(),
6521 [&](TreeEntry *ME) { return ME->isSame(VL); });
6522 if (TEIt != It->getSecond().end())
6532 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
6533 if (TryToFindDuplicates(S))
6534 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6535 ReuseShuffleIndicies);
6541 E->UserTreeIndices.push_back(UserTreeIdx);
6542 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.OpValue
6549 for (
Value *V : VL) {
6550 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6553 if (getTreeEntry(V)) {
6555 <<
") is already in tree.\n");
6556 if (TryToFindDuplicates(S))
6557 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6558 ReuseShuffleIndicies);
6564 if (UserIgnoreList && !UserIgnoreList->empty()) {
6565 for (
Value *V : VL) {
6566 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6567 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
6568 if (TryToFindDuplicates(S))
6569 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6570 ReuseShuffleIndicies);
6578 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6579 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6581 assert(S.OpValue->getType()->isPointerTy() &&
6582 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6583 "Expected pointers only.");
6585 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
6586 assert(It != VL.end() &&
"Expected at least one GEP.");
6592 auto *VL0 = cast<Instruction>(S.OpValue);
6599 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6608 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6613 if (!TryToFindDuplicates(S,
true))
6619 TreeEntry::EntryState State = getScalarsVectorizationState(
6620 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6621 if (State == TreeEntry::NeedToGather) {
6622 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6623 ReuseShuffleIndicies);
6627 auto &BSRef = BlocksSchedules[BB];
6629 BSRef = std::make_unique<BlockScheduling>(BB);
6631 BlockScheduling &BS = *BSRef;
6633 std::optional<ScheduleData *> Bundle =
6634 BS.tryScheduleBundle(UniqueValues,
this, S);
6635#ifdef EXPENSIVE_CHECKS
6640 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
6641 assert((!BS.getScheduleData(VL0) ||
6642 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6643 "tryScheduleBundle should cancelScheduling on failure");
6644 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6645 ReuseShuffleIndicies);
6648 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
6650 unsigned ShuffleOrOp = S.isAltShuffle() ?
6651 (
unsigned) Instruction::ShuffleVector : S.getOpcode();
6652 switch (ShuffleOrOp) {
6653 case Instruction::PHI: {
6654 auto *PH = cast<PHINode>(VL0);
6657 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6662 for (
unsigned I = 0,
E = PH->getNumIncomingValues();
I <
E; ++
I) {
6672 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
6673 PH->getIncomingBlock(
I)));
6677 for (
unsigned OpIdx = 0, OpE = OperandsVec.
size(); OpIdx != OpE; ++OpIdx)
6678 buildTree_rec(OperandsVec[OpIdx],
Depth + 1, {
TE, OpIdx});
6681 case Instruction::ExtractValue:
6682 case Instruction::ExtractElement: {
6683 if (CurrentOrder.empty()) {
6684 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
6685 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6686 ReuseShuffleIndicies);
6690 Op0.
assign(VL.size(), VL0->getOperand(0));
6691 VectorizableTree.back()->setOperand(0, Op0);
6695 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
6697 for (
unsigned Idx : CurrentOrder)
6704 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6705 ReuseShuffleIndicies, CurrentOrder);
6709 Op0.
assign(VL.size(), VL0->getOperand(0));
6710 VectorizableTree.back()->setOperand(0, Op0);
6713 case Instruction::InsertElement: {
6714 assert(ReuseShuffleIndicies.
empty() &&
"All inserts should be unique");
6716 auto OrdCompare = [](
const std::pair<int, int> &P1,
6717 const std::pair<int, int> &P2) {
6718 return P1.first > P2.first;
6721 decltype(OrdCompare)>
6722 Indices(OrdCompare);
6723 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
6725 Indices.emplace(
Idx,
I);
6727 OrdersType CurrentOrder(VL.size(), VL.size());
6728 bool IsIdentity =
true;
6729 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
6730 CurrentOrder[Indices.top().second] =
I;
6731 IsIdentity &= Indices.top().second ==
I;
6735 CurrentOrder.clear();
6736 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6737 std::nullopt, CurrentOrder);
6740 constexpr int NumOps = 2;
6742 for (
int I = 0;
I < NumOps; ++
I) {
6744 VectorOperands[
I].
push_back(cast<Instruction>(V)->getOperand(
I));
6746 TE->setOperand(
I, VectorOperands[
I]);
6748 buildTree_rec(VectorOperands[NumOps - 1],
Depth + 1, {
TE, NumOps - 1});
6751 case Instruction::Load: {
6758 TreeEntry *
TE =
nullptr;
6761 case TreeEntry::Vectorize:
6762 if (CurrentOrder.empty()) {
6764 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6765 ReuseShuffleIndicies);
6769 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6770 ReuseShuffleIndicies, CurrentOrder);
6773 TE->setOperandsInOrder();
6775 case TreeEntry::StridedVectorize:
6777 if (CurrentOrder.empty()) {
6778 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6779 UserTreeIdx, ReuseShuffleIndicies);
6781 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6782 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6784 TE->setOperandsInOrder();
6787 case TreeEntry::ScatterVectorize:
6789 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6790 UserTreeIdx, ReuseShuffleIndicies);
6791 TE->setOperandsInOrder();
6792 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
6793 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
6795 case TreeEntry::NeedToGather:
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
6812 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6813 std::make_pair(std::numeric_limits<unsigned>::min(),
6814 std::numeric_limits<unsigned>::max()));
6815 if (ShuffleOrOp == Instruction::ZExt ||
6816 ShuffleOrOp == Instruction::SExt) {
6817 CastMaxMinBWSizes = std::make_pair(
6818 std::max<unsigned>(
DL->getTypeSizeInBits(VL0->getType()),
6821 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6823 }
else if (ShuffleOrOp == Instruction::Trunc) {
6824 CastMaxMinBWSizes = std::make_pair(
6826 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6828 std::min<unsigned>(
DL->getTypeSizeInBits(VL0->getType()),
6830 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
6831 }
else if (ShuffleOrOp == Instruction::SIToFP ||
6832 ShuffleOrOp == Instruction::UIToFP) {
6833 unsigned NumSignBits =
6835 if (
auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
6837 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
6839 if (NumSignBits * 2 >=
6840 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6841 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
6843 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6844 ReuseShuffleIndicies);
6847 TE->setOperandsInOrder();
6848 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6852 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
6858 case Instruction::ICmp:
6859 case Instruction::FCmp: {
6862 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6863 ReuseShuffleIndicies);
6871 "Commutative Predicate mismatch");
6872 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
6875 for (
Value *V : VL) {
6876 auto *
Cmp = cast<CmpInst>(V);
6879 if (
Cmp->getPredicate() != P0)
6889 if (ShuffleOrOp == Instruction::ICmp) {
6890 unsigned NumSignBits0 =
6892 if (NumSignBits0 * 2 >=
6893 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6894 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
6895 unsigned NumSignBits1 =
6897 if (NumSignBits1 * 2 >=
6898 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
6899 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
6923 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6924 ReuseShuffleIndicies);
6931 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
6939 TE->setOperandsInOrder();
6940 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6944 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
6950 case Instruction::GetElementPtr: {
6951 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6952 ReuseShuffleIndicies);
6956 for (
Value *V : VL) {
6957 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
6962 Operands.front().push_back(
GEP->getPointerOperand());
6971 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
6973 [VL0Ty, IndexIdx](
Value *V) {
6974 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
6977 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
6980 :
DL->getIndexType(cast<GetElementPtrInst>(VL0)
6981 ->getPointerOperandType()
6984 for (
Value *V : VL) {
6985 auto *
I = dyn_cast<GetElementPtrInst>(V);
6988 ConstantInt::get(Ty, 0,
false));
6991 auto *
Op =
I->getOperand(IndexIdx);
6992 auto *CI = dyn_cast<ConstantInt>(
Op);
6997 CI, Ty, CI->getValue().isSignBitSet(), *
DL));
7001 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
7005 case Instruction::Store: {
7009 for (
Value *V : VL) {
7010 auto *
SI = cast<StoreInst>(V);
7011 *OIter =
SI->getValueOperand();
7015 if (CurrentOrder.empty()) {
7017 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7018 ReuseShuffleIndicies);
7019 TE->setOperandsInOrder();
7024 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7025 ReuseShuffleIndicies, CurrentOrder);
7026 TE->setOperandsInOrder();
7028 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
7032 case Instruction::Call: {
7035 CallInst *CI = cast<CallInst>(VL0);
7038 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7039 ReuseShuffleIndicies);
7044 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7048 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7052 for (
Value *V : VL) {
7053 auto *CI2 = cast<CallInst>(V);
7060 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7067 TE->setOperandsInOrder();
7068 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
7075 for (
Value *V : VL) {
7076 auto *CI2 = cast<CallInst>(V);
7083 case Instruction::ShuffleVector: {
7084 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7085 ReuseShuffleIndicies);
7089 auto *CI = dyn_cast<CmpInst>(VL0);
7090 if (isa<BinaryOperator>(VL0) || CI) {
7093 return cast<CmpInst>(V)->isCommutative();
7095 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7097 auto *MainCI = cast<CmpInst>(S.MainOp);
7098 auto *AltCI = cast<CmpInst>(S.AltOp);
7102 "Expected different main/alternate predicates.");
7105 for (
Value *V : VL) {
7106 auto *
Cmp = cast<CmpInst>(V);
7128 TE->setOperandsInOrder();
7129 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7133 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Only homogeneous structs can be flattened into a vector.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          // ...
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
  // ...
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    // ...
7178 bool ResizeAllowed)
const {
7179 const auto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7180 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
7181 auto *E0 = cast<Instruction>(*It);
7183 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7187 Value *Vec = E0->getOperand(0);
7189 CurrentOrder.
clear();
7193 if (E0->getOpcode() == Instruction::ExtractValue) {
7198 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7202 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
7205 unsigned E = VL.
size();
7206 if (!ResizeAllowed && NElts !=
E)
7209 unsigned MinIdx = NElts, MaxIdx = 0;
7211 auto *Inst = dyn_cast<Instruction>(V);
7214 if (Inst->getOperand(0) != Vec)
7216 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
7217 if (isa<UndefValue>(EE->getIndexOperand()))
7222 const unsigned ExtIdx = *
Idx;
7223 if (ExtIdx >= NElts)
7225 Indices[
I] = ExtIdx;
7226 if (MinIdx > ExtIdx)
7228 if (MaxIdx < ExtIdx)
7231 if (MaxIdx - MinIdx + 1 >
E)
7233 if (MaxIdx + 1 <=
E)
7237 bool ShouldKeepOrder =
true;
7244 for (
unsigned I = 0;
I <
E; ++
I) {
7247 const unsigned ExtIdx = Indices[
I] - MinIdx;
7248 if (CurrentOrder[ExtIdx] !=
E) {
7249 CurrentOrder.
clear();
7252 ShouldKeepOrder &= ExtIdx ==
I;
7253 CurrentOrder[ExtIdx] =
I;
7255 if (ShouldKeepOrder)
7256 CurrentOrder.
clear();
7258 return ShouldKeepOrder;
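  // -------------------------------------------------------------------------
  // Illustrative sketch (not part of this pass): canReuseExtract above
  // collects the constant extract indices, maps each index to the lane that
  // uses it, rejects duplicates and out-of-range indices, and clears the
  // order again when it is already the identity.  The helper below
  // (buildExtractOrder, a hypothetical name) models the same loop on plain
  // indices; standard library only, guarded out of the build.
#if 0
#include <vector>

// ExtIdxPerLane[I] is the (rebased) extract index used by lane I.  Returns
// true when every lane already extracts its own position; CurrentOrder is
// cleared on identity or on failure, as in the routine above.
inline bool buildExtractOrder(const std::vector<unsigned> &ExtIdxPerLane,
                              std::vector<unsigned> &CurrentOrder) {
  const unsigned E = ExtIdxPerLane.size();
  CurrentOrder.assign(E, E); // E means "unassigned".
  bool ShouldKeepOrder = true;
  for (unsigned I = 0; I < E; ++I) {
    const unsigned ExtIdx = ExtIdxPerLane[I];
    if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear(); // Out-of-range or duplicate index: give up.
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear(); // Identity order: extracts can be reused as-is.
  return ShouldKeepOrder;
}
#endif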
7261bool BoUpSLP::areAllUsersVectorized(
7263 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
7265 return ScalarToTreeEntry.contains(U) ||
7266 isVectorLikeInstWithConstOps(U) ||
7267 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7271static std::pair<InstructionCost, InstructionCost>
7279 if (
auto *FPCI = dyn_cast<FPMathOperator>(CI))
7280 FMF = FPCI->getFastMathFlags();
7283 dyn_cast<IntrinsicInst>(CI));
7284 auto IntrinsicCost =
7291 auto LibCost = IntrinsicCost;
7298 return {IntrinsicCost, LibCost};
7301void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7305 unsigned Sz = Scalars.size();
7308 if (!ReorderIndices.empty())
7310 for (
unsigned I = 0;
I < Sz; ++
I) {
7312 if (!ReorderIndices.empty())
7314 auto *OpInst = cast<Instruction>(Scalars[
Idx]);
7315 if (IsAltOp(OpInst)) {
7325 if (!ReuseShuffleIndices.empty()) {
7328 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7338 if (
auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7339 auto *AltCI = cast<CmpInst>(AltOp);
7342 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
7343 auto *CI = cast<CmpInst>(
I);
7351 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
7352 "CmpInst expected to match either main or alternate predicate or "
7355 return MainP !=
P && MainP != SwappedP;
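  // -------------------------------------------------------------------------
  // Illustrative sketch (not part of this pass): buildAltOpShuffleMask above
  // walks the scalars and records, per lane, whether the lane is produced by
  // the "main" or the "alternate" opcode.  For two VF-wide vectors the
  // resulting shufflevector mask picks lane I from the first vector as I and
  // from the second vector as VF + I.  The helper below (buildAltMask, a
  // hypothetical name) shows that mapping; guarded out of the build.
#if 0
#include <functional>
#include <vector>

inline std::vector<int>
buildAltMask(unsigned VF, const std::function<bool(unsigned)> &IsAltLane) {
  std::vector<int> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = IsAltLane(I) ? static_cast<int>(VF + I) : static_cast<int>(I);
  return Mask;
}
// For VF = 4 with lanes alternating main/alt (e.g. add/sub): {0, 5, 2, 7}.
#endif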
7362 const auto *Op0 = Ops.
front();
7368 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
7372 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
7374 if (
auto *CI = dyn_cast<ConstantInt>(V))
7375 return CI->getValue().isPowerOf2();
7378 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
7380 if (
auto *CI = dyn_cast<ConstantInt>(V))
7381 return CI->getValue().isNegatedPowerOf2();
7386 if (IsConstant && IsUniform)
7388 else if (IsConstant)
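  // -------------------------------------------------------------------------
  // Illustrative sketch (not part of this pass): the checks above classify a
  // bundle of operands for the cost model: all-constant, all-uniform (same
  // value), and all power-of-two values are the interesting buckets.  The
  // helper below (classifyOperands, a hypothetical name) shows such a
  // classification on plain integers; standard library only, guarded out of
  // the build.
#if 0
#include <vector>

struct OperandKind {
  bool Constant = false;
  bool Uniform = false;
  bool PowerOfTwo = false;
};

inline OperandKind classifyOperands(const std::vector<long long> &Ops,
                                    bool AllConstant) {
  OperandKind K;
  if (Ops.empty())
    return K;
  K.Constant = AllConstant;
  K.Uniform = true;
  K.PowerOfTwo = true;
  for (long long V : Ops) {
    K.Uniform &= V == Ops.front();
    K.PowerOfTwo &= V > 0 && (V & (V - 1)) == 0;
  }
  return K;
}
#endif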
7402class BaseShuffleAnalysis {
7409 int Limit =
Mask.size();
7421 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
7437 unsigned VF =
Mask.size();
7439 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
7442 int MaskedIdx =
Mask[ExtMask[
I] % VF];
7483 bool SinglePermute) {
7487 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
7489 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7495 if (isIdentityMask(Mask, SVTy,
false)) {
7496 if (!IdentityOp || !SinglePermute ||
7497 (isIdentityMask(Mask, SVTy,
true) &&
7499 IdentityMask.
size()))) {
7504 IdentityMask.
assign(Mask);
7524 if (SV->isZeroEltSplat()) {
7526 IdentityMask.
assign(Mask);
7528 int LocalVF =
Mask.size();
7530 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7531 LocalVF = SVOpTy->getNumElements();
7535 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
7537 ExtMask[
Idx] = SV->getMaskValue(
I);
7547 if (!IsOp1Undef && !IsOp2Undef) {
7549 for (
int &
I : Mask) {
7552 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
7559 SV->getShuffleMask().end());
7560 combineMasks(LocalVF, ShuffleMask, Mask);
7561 Mask.swap(ShuffleMask);
7563 Op = SV->getOperand(0);
7565 Op = SV->getOperand(1);
7567 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
7568 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7573 "Expected masks of same sizes.");
7578 Mask.swap(IdentityMask);
7579 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7580 return SinglePermute &&
7581 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
7583 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
7584 Shuffle->isZeroEltSplat() &&
7597 template <
typename T,
typename ShuffleBuilderTy>
7599 ShuffleBuilderTy &Builder) {
7600 assert(V1 &&
"Expected at least one vector value.");
7602 Builder.resizeToMatch(V1, V2);
7603 int VF =
Mask.size();
7604 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
7605 VF = FTy->getNumElements();
7612 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7615 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7617 CombinedMask1[
I] =
Mask[
I];
7619 CombinedMask2[
I] =
Mask[
I] - VF;
7626 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
7627 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
7630 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7631 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7636 ExtMask1[
Idx] = SV1->getMaskValue(
I);
7639 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7641 ExtMask1, UseMask::SecondArg);
7646 ExtMask2[
Idx] = SV2->getMaskValue(
I);
7649 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7651 ExtMask2, UseMask::SecondArg);
7652 if (SV1->getOperand(0)->getType() ==
7653 SV2->getOperand(0)->getType() &&
7654 SV1->getOperand(0)->getType() != SV1->getType() &&
7657 Op1 = SV1->getOperand(0);
7658 Op2 = SV2->getOperand(0);
7660 SV1->getShuffleMask().end());
7661 int LocalVF = ShuffleMask1.size();
7662 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
7663 LocalVF = FTy->getNumElements();
7664 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7665 CombinedMask1.swap(ShuffleMask1);
7667 SV2->getShuffleMask().end());
7668 LocalVF = ShuffleMask2.size();
7669 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
7670 LocalVF = FTy->getNumElements();
7671 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7672 CombinedMask2.swap(ShuffleMask2);
7675 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
7676 Builder.resizeToMatch(Op1, Op2);
7677 VF = std::max(cast<VectorType>(Op1->
getType())
7679 .getKnownMinValue(),
7680 cast<VectorType>(Op2->
getType())
7682 .getKnownMinValue());
7683 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7686 "Expected undefined mask element");
7687 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
7693 isa<ShuffleVectorInst>(Op1) &&
7694 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7696 return Builder.createIdentity(Op1);
7697 return Builder.createShuffleVector(
7701 if (isa<PoisonValue>(V1))
7702 return Builder.createPoison(
7703 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
7705 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
7706 assert(V1 &&
"Expected non-null value after looking through shuffles.");
7709 return Builder.createShuffleVector(V1, NewMask);
7710 return Builder.createIdentity(V1);
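  // -------------------------------------------------------------------------
  // Illustrative sketch (not part of this pass): peekThroughShuffles and
  // createShuffleVector above fold a shuffle-of-shuffle into one shuffle by
  // composing the two masks: the element chosen by the outer mask is itself
  // looked up in the inner shuffle's mask, and poison lanes (modelled as -1
  // here) stay poison.  The helper below (composeMasks, a hypothetical name)
  // shows that composition; standard library only, guarded out of the build.
#if 0
#include <vector>

constexpr int kPoison = -1;

inline std::vector<int> composeMasks(const std::vector<int> &InnerMask,
                                     const std::vector<int> &OuterMask) {
  std::vector<int> Result(OuterMask.size(), kPoison);
  for (unsigned I = 0; I < OuterMask.size(); ++I) {
    const int Idx = OuterMask[I];
    if (Idx == kPoison || Idx >= static_cast<int>(InnerMask.size()))
      continue; // Keep poison / out-of-range lanes poison.
    Result[I] = InnerMask[Idx];
  }
  return Result;
}
// For example Inner = {3,2,1,0}, Outer = {0,0,2,-1} -> {3,3,1,-1}.
#endif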
7726 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7729 Mask, NumSrcElts, NumSubElts,
Index)) {
7730 if (
Index + NumSubElts > NumSrcElts &&
7731 Index + NumSrcElts <=
static_cast<int>(Mask.size()))
static std::pair<InstructionCost, InstructionCost>
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
    for (Value *V : Ptrs) {
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      if (!Ptr || !Ptr->hasOneUse())
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
          TTI::PointersChainInfo::getKnownStride(),
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               ? TTI::PointersChainInfo::getUnknownStride()
               : TTI::PointersChainInfo::getKnownStride();
  if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
        BaseGEP->getPointerOperand(), Indices, VecTy,
  return std::make_pair(ScalarCost, VecCost);
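// Returns the {scalar, vector} cost pair for the address computations feeding
// a chain of loads or stores: the scalar side prices every per-lane GEP that
// would remain in scalar code, while the vector side prices only the pointers
// that survive once the memory access itself is vectorized.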
  bool IsFinalized = false;
  bool SameNodesEstimated = true;
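  // IsFinalized flags that finalize() has been called and no further inputs
  // may be added; SameNodesEstimated records whether the vectors currently in
  // InVectors are still the ones whose permute cost was last estimated, so a
  // follow-up add() of the same nodes can extend CommonMask instead of paying
  // for an extra shuffle.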
7845 if (
auto *VTy = dyn_cast<VectorType>(Ty))
7861 const unsigned Sz = R.DL->getTypeSizeInBits(VL.
front()->getType());
7862 unsigned MinVF = R.getMinVF(2 * Sz);
7863 if (VL.
size() > 2 &&
7864 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7865 (InVectors.
empty() &&
7868 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7869 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7870 return S.getOpcode() == Instruction::Load &&
7873 !
all_of(Gathers, [&](
Value *V) {
return R.getTreeEntry(V); }) &&
7879 unsigned StartIdx = 0;
7880 unsigned VF = VL.
size() / 2;
7881 for (; VF >= MinVF; VF /= 2) {
7882 for (
unsigned Cnt = StartIdx,
End = VL.
size(); Cnt + VF <=
End;
7885 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7887 if (SliceS.getOpcode() != Instruction::Load ||
7888 SliceS.isAltShuffle())
7896 CurrentOrder, PointerOps);
7906 CurrentOrder.
empty()) ||
7915 if (Cnt == StartIdx)
7924 if (StartIdx >= VL.
size())
7927 if (!VectorizedLoads.
empty())
7930 if (!VectorizedLoads.
empty()) {
7932 bool NeedInsertSubvectorAnalysis =
7933 !NumParts || (VL.
size() / VF) > NumParts;
7939 getBuildVectorCost(VL.
slice(
I, std::min(
End -
I, VF)), Root);
7946 for (
Value *V : VectorizedLoads) {
7947 auto *LI = cast<LoadInst>(V);
7954 for (
const std::pair<unsigned, LoadsState> &
P : VectorizedStarts) {
7955 auto *LI = cast<LoadInst>(VL[
P.first]);
7964 false, Alignment, CostKind, LI);
7968 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
7969 auto [ScalarGEPCost, VectorGEPCost] =
7971 Instruction::Load, CostKind, LI->
getType(), LoadTy);
7972 GatherCost += VectorGEPCost - ScalarGEPCost;
7974 for (
unsigned P : ScatterVectorized) {
7975 auto *LI0 = cast<LoadInst>(VL[
P]);
7977 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
7979 Instruction::Load, LoadTy, LI0->getPointerOperand(),
7980 false, CommonAlignment, CostKind, LI0);
7984 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
7992 auto [ScalarGEPCost, VectorGEPCost] =
7994 CostKind, ScalarTy, VecTy);
7995 GatherCost += VectorGEPCost - ScalarGEPCost;
7996 if (!Order.
empty()) {
8000 VecTy, Mask, CostKind);
8003 GatherCost += R.getGatherCost(PointerOps,
true);
8006 if (NeedInsertSubvectorAnalysis) {
8009 for (
unsigned I = VF, E = VL.
size();
I < E;
I += VF) {
8010 for (
unsigned Idx : seq<unsigned>(0, E))
8013 ShuffleMask, CostKind,
I, LoadTy);
8016 GatherCost -= ScalarsCost;
8018 GatherCost = std::min(BaseCost, GatherCost);
8019 }
else if (!Root &&
isSplat(VL)) {
8022 const auto *It =
find_if_not(VL, IsaPred<UndefValue>);
8023 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
8026 count(VL, *It) > 1 &&
8030 CostKind, std::distance(VL.
begin(), It),
8035 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8038 Instruction::InsertElement, VecTy, CostKind, 0,
8042 ShuffleMask, CostKind, 0,
8046 (
all_of(Gathers, IsaPred<UndefValue>)
8048 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers)));
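    // Summary of the build-vector (gather) cost above: slices of consecutive
    // loads that can be turned into wide, strided or masked-gather loads are
    // priced as such plus the shuffles needed to stitch the pieces together; a
    // splat of one repeated scalar is priced as an insertelement plus a
    // broadcast shuffle; everything else falls back to R.getGatherCost for the
    // remaining scalars, and the cheaper of the plain-gather and load-based
    // estimates wins.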
                                    ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                                    unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          return std::max(Sz, VecTy->getNumElements());
    if (NumSrcRegs == 0)
    auto CheckPerRegistersShuffle =
          int FirstRegId = -1;
          for (int &I : Mask) {
            int RegId =
                (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
            RegIndices.insert(RegId);
            if (RegIndices.size() > 2)
              return std::nullopt;
            if (RegIndices.size() == 2)
            I = (I % NumElts) % EltsPerVector +
                (RegId == FirstRegId ? 0 : EltsPerVector);
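            // Each surviving mask element is rewritten into the coordinate
            // space of at most two source registers: its offset within the
            // register, plus EltsPerVector when it comes from the second
            // register. Illustrative example with EltsPerVector == 4: original
            // index 6 maps to 2 if its register is the first source, or to
            // 2 + 4 == 6 if it is the second.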
8102 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8103 if (!ShuffleKinds[Part])
8106 Mask.slice(Part * EltsPerVector,
8107 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8108 ? Mask.size() % EltsPerVector
8112 std::optional<TTI::ShuffleKind> RegShuffleKind =
8113 CheckPerRegistersShuffle(SubMask);
8114 if (!RegShuffleKind) {
8116 TTI, *ShuffleKinds[Part],
8123 TTI, *RegShuffleKind,
8134 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8141 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
8143 unsigned SliceSize) {
8144 if (SameNodesEstimated) {
8150 if ((InVectors.
size() == 2 &&
8151 InVectors.
front().get<
const TreeEntry *>() == &E1 &&
8152 InVectors.
back().get<
const TreeEntry *>() == E2) ||
8153 (!E2 && InVectors.
front().get<
const TreeEntry *>() == &E1)) {
8156 "Expected all poisoned elements.");
8159 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
8164 Cost += createShuffle(InVectors.
front(),
8165 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
8167 transformMaskAfterShuffle(CommonMask, CommonMask);
8169 SameNodesEstimated =
false;
8170 if (!E2 && InVectors.
size() == 1) {
8171 unsigned VF = E1.getVectorFactor();
8174 cast<FixedVectorType>(V1->
getType())->getNumElements());
8176 const auto *E = InVectors.
front().get<
const TreeEntry *>();
8177 VF = std::max(VF, E->getVectorFactor());
8179 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8181 CommonMask[
Idx] = Mask[
Idx] + VF;
8182 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
8183 transformMaskAfterShuffle(CommonMask, CommonMask);
8185 Cost += createShuffle(&E1, E2, Mask);
8186 transformMaskAfterShuffle(CommonMask, Mask);
  class ShuffleCostBuilder {
    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      return Mask.empty() ||
             (VF == Mask.size() &&
    ~ShuffleCostBuilder() = default;
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
          cast<VectorType>(V1->getType()), Mask);
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
          cast<VectorType>(V1->getType()), Mask);
    void resizeToMatch(Value *&, Value *&) const {}
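    // ShuffleCostBuilder is the cost-modelling counterpart of the IR-emitting
    // shuffle builder used during codegen: empty and identity masks are free,
    // other masks are priced as single- or two-source permutes through TTI,
    // and resizeToMatch() is a no-op because no real vectors need widening
    // when only costs are computed.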
8238 ShuffleCostBuilder Builder(
TTI);
8241 unsigned CommonVF = Mask.size();
8242 if (!V1 && !V2 && !P2.
isNull()) {
8244 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8245 unsigned VF = E->getVectorFactor();
8246 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8247 CommonVF = std::max(VF, E2->getVectorFactor());
8250 return Idx < 2 * static_cast<int>(CommonVF);
8252 "All elements in mask must be less than 2 * CommonVF.");
8253 if (E->Scalars.size() == E2->Scalars.size()) {
8257 for (
int &
Idx : CommonMask) {
8260 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
8262 else if (
Idx >=
static_cast<int>(CommonVF))
8263 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
8267 CommonVF = E->Scalars.size();
8271 V2 = getAllOnesValue(
8273 }
else if (!V1 && P2.
isNull()) {
8275 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8276 unsigned VF = E->getVectorFactor();
8280 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8281 "All elements in mask must be less than CommonVF.");
8282 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8284 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
8285 for (
int &
Idx : CommonMask) {
8289 CommonVF = E->Scalars.size();
8294 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8295 CommonVF == CommonMask.
size() &&
8297 [](
const auto &&
P) {
8299 static_cast<unsigned>(
P.value()) !=
P.index();
8307 }
else if (V1 && P2.
isNull()) {
8309 CommonVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8312 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8313 "All elements in mask must be less than CommonVF.");
8314 }
else if (V1 && !V2) {
8316 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8317 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8318 CommonVF = std::max(VF, E2->getVectorFactor());
8321 return Idx < 2 * static_cast<int>(CommonVF);
8323 "All elements in mask must be less than 2 * CommonVF.");
8324 if (E2->Scalars.size() == VF && VF != CommonVF) {
8326 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
8327 for (
int &
Idx : CommonMask) {
8330 if (
Idx >=
static_cast<int>(CommonVF))
8331 Idx = E2Mask[
Idx - CommonVF] + VF;
8337 V2 = getAllOnesValue(
8340 }
else if (!V1 && V2) {
8342 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8343 const TreeEntry *E1 = P1.
get<
const TreeEntry *>();
8344 CommonVF = std::max(VF, E1->getVectorFactor());
8347 return Idx < 2 * static_cast<int>(CommonVF);
8349 "All elements in mask must be less than 2 * CommonVF.");
8350 if (E1->Scalars.size() == VF && VF != CommonVF) {
8352 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
8353 for (
int &
Idx : CommonMask) {
8356 if (
Idx >=
static_cast<int>(CommonVF))
8357 Idx = E1Mask[
Idx - CommonVF] + VF;
8365 V2 = getAllOnesValue(
8369 assert(V1 && V2 &&
"Expected both vectors.");
8370 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8372 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8375 return Idx < 2 * static_cast<int>(CommonVF);
8377 "All elements in mask must be less than 2 * CommonVF.");
8378 if (V1->
getType() != V2->getType()) {
8380 cast<FixedVectorType>(V1->
getType())->getElementType(), CommonVF));
8381 V2 = getAllOnesValue(
8383 cast<FixedVectorType>(V1->
getType())->getElementType(),
8388 cast<FixedVectorType>(V1->
getType())->getElementType(),
8389 CommonMask.
size()));
8390 if (InVectors.
size() == 2)
8392 return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8393 V1, V2, CommonMask, Builder);
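  // The branches above only normalize both inputs to a common vector factor
  // and rewrite CommonMask into that space; the actual pricing is delegated to
  // BaseShuffleAnalysis::createShuffle instantiated with InstructionCost, so
  // the cost model and the code generator share one shuffle-folding path.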
      : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
        R(R), CheckedExtracts(CheckedExtracts) {}
8403 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8404 unsigned NumParts,
bool &UseVecBaseAsInput) {
8405 UseVecBaseAsInput =
false;
8408 Value *VecBase =
nullptr;
8411 if (NumParts == VL.
size())
    bool PrevNodeFound = any_of(
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((!TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->State == TreeEntry::NeedToGather) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
8429 unsigned SliceSize = VL.
size() / NumParts;
8430 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8431 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8432 for (
auto [
I, V] :
enumerate(VL.
slice(Part * SliceSize, SliceSize))) {
8434 if (isa<UndefValue>(V) ||
8443 auto *EE = cast<ExtractElementInst>(V);
8444 VecBase = EE->getVectorOperand();
8445 UniqueBases.
insert(VecBase);
8446 const TreeEntry *VE = R.getTreeEntry(V);
8447 if (!CheckedExtracts.
insert(V).second ||
8448 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8454 unsigned Idx = *EEIdx;
8456 if (EE->hasOneUse() || !PrevNodeFound) {
8458 if (isa<SExtInst, ZExtInst>(Ext) &&
8459 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8464 EE->getVectorOperandType(),
Idx);
8467 Ext->getOpcode(), Ext->getType(), EE->getType(),
8483 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8486 transformMaskAfterShuffle(CommonMask, CommonMask);
8487 SameNodesEstimated =
false;
8488 if (NumParts != 1 && UniqueBases.
size() != 1) {
8489 UseVecBaseAsInput =
true;
8497 std::optional<InstructionCost>
8501 return std::nullopt;
8507 return Idx < static_cast<int>(E1.getVectorFactor());
8509 "Expected single vector shuffle mask.");
8513 if (InVectors.
empty()) {
8514 CommonMask.
assign(Mask.begin(), Mask.end());
8515 InVectors.
assign({&E1, &E2});
8518 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8522 if (NumParts == 0 || NumParts >= Mask.size())
8524 unsigned SliceSize = Mask.size() / NumParts;
8527 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8528 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8531 if (InVectors.
empty()) {
8532 CommonMask.
assign(Mask.begin(), Mask.end());
8533 InVectors.
assign(1, &E1);
8536 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8540 if (NumParts == 0 || NumParts >= Mask.size())
8542 unsigned SliceSize = Mask.size() / NumParts;
8545 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8546 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
8547 if (!SameNodesEstimated && InVectors.
size() == 1)
8560 cast<ExtractElementInst>(InVectors.
front()
8561 .get<
const TreeEntry *>()
8562 ->Scalars[
P.index()]);
8563 return EI->getVectorOperand() == V1 ||
8564 EI->getVectorOperand() == V2;
8566 "Expected extractelement vectors.");
8570 if (InVectors.
empty()) {
8572 "Expected empty input mask/vectors.");
8573 CommonMask.
assign(Mask.begin(), Mask.end());
8580 InVectors.
front().is<
const TreeEntry *>() && !CommonMask.
empty() &&
8584 .get<const TreeEntry *>()
8585 ->Scalars[
P.index()];
8587 return P.value() == Mask[
P.index()] ||
8588 isa<UndefValue>(Scalar);
8589 if (isa<Constant>(V1))
8591 auto *EI = cast<ExtractElementInst>(Scalar);
8592 return EI->getVectorOperand() == V1;
8594 "Expected only tree entry for extractelement vectors.");
8598 "Expected only tree entries from extracts/reused buildvectors.");
8599 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8600 if (InVectors.
size() == 2) {
8601 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
8602 transformMaskAfterShuffle(CommonMask, CommonMask);
8603 VF = std::max<unsigned>(VF, CommonMask.
size());
8604 }
else if (
const auto *InTE =
8605 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
8606 VF = std::max(VF, InTE->getVectorFactor());
8610 ->getNumElements());
8613 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8615 CommonMask[
Idx] = Mask[
Idx] + VF;
8618 Value *Root =
nullptr) {
8619 Cost += getBuildVectorCost(VL, Root);
8623 unsigned VF = VL.
size();
8625 VF = std::min(VF, MaskVF);
8627 if (isa<UndefValue>(V)) {
8637 cast<FixedVectorType>(Root->
getType())->getNumElements()),
8638 getAllOnesValue(*R.DL, VL.
front()->getType()));
8648 if (InVectors.
size() == 2)
8649 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
8651 Cost += createShuffle(Vec,
nullptr, CommonMask);
8652 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8656 "Expected vector length for the final value before action.");
8658 Action(V, CommonMask);
8659 InVectors.
front() = V;
8662 if (CommonMask.
empty()) {
8663 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
8667 createShuffle(InVectors.
front(),
8668 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
8674 "Shuffle construction must be finalized.");
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *TE = getTreeEntry(Op)) {
    if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.EdgeIdx == Idx && EI.UserTE == E;
        }) != TE->UserTreeIndices.end())
    auto MIt = MultiNodeScalars.find(Op);
    if (MIt != MultiNodeScalars.end()) {
      for (const TreeEntry *TE : MIt->second) {
        if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
              return EI.EdgeIdx == Idx && EI.UserTE == E;
            }) != TE->UserTreeIndices.end())
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::NeedToGather &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8708 if (
TE.State == TreeEntry::ScatterVectorize ||
8709 TE.State == TreeEntry::StridedVectorize)
8711 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
8712 !
TE.isAltShuffle()) {
8713 if (
TE.ReorderIndices.empty())
8752 Type *ScalarTy = VL[0]->getType();
8753 if (E->State != TreeEntry::NeedToGather) {
8754 if (
auto *SI = dyn_cast<StoreInst>(VL[0]))
8755 ScalarTy =
SI->getValueOperand()->getType();
8756 else if (
auto *CI = dyn_cast<CmpInst>(VL[0]))
8758 else if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8759 ScalarTy =
IE->getOperand(1)->getType();
8768 auto It = MinBWs.
find(E);
8769 Type *OrigScalarTy = ScalarTy;
8770 if (It != MinBWs.
end()) {
8774 unsigned EntryVF = E->getVectorFactor();
8777 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8778 if (E->State == TreeEntry::NeedToGather) {
8781 if (isa<InsertElementInst>(VL[0]))
8783 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8784 E, *
TTI, VectorizedVals, *
this, CheckedExtracts);
8789 if (!E->ReorderIndices.empty() &&
8790 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8792 if (E->getOpcode() == Instruction::Store) {
8794 NewMask.
resize(E->ReorderIndices.size());
8795 copy(E->ReorderIndices, NewMask.
begin());
8801 if (NeedToShuffleReuses)
8802 ::addMask(Mask, E->ReuseShuffleIndices);
8806 assert((E->State == TreeEntry::Vectorize ||
8807 E->State == TreeEntry::ScatterVectorize ||
8808 E->State == TreeEntry::StridedVectorize) &&
8812 (E->getOpcode() == Instruction::GetElementPtr &&
8813 E->getMainOp()->getType()->isPointerTy())) &&
8816 unsigned ShuffleOrOp =
8817 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
8819 const unsigned Sz = UniqueValues.
size();
8821 for (
unsigned I = 0;
I < Sz; ++
I) {
8822 if (getTreeEntry(UniqueValues[
I]) == E)
8826 auto GetCastContextHint = [&](
Value *
V) {
8827 if (
const TreeEntry *OpTE = getTreeEntry(V))
8828 return getCastContextHint(*OpTE);
8829 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
8830 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8839 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8843 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8845 for (
unsigned I = 0;
I < Sz; ++
I) {
8846 if (UsedScalars.test(
I))
8848 ScalarCost += ScalarEltCost(
I);
8856 const EdgeInfo &EI = E->UserTreeIndices.front();
8857 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8859 It != MinBWs.
end()) {
8860 auto UserBWIt = MinBWs.
find(EI.UserTE);
8861 Type *UserScalarTy =
8862 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8863 if (UserBWIt != MinBWs.
end())
8865 UserBWIt->second.first);
8866 if (ScalarTy != UserScalarTy) {
8867 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
8868 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
8873 VecOpcode = Instruction::Trunc;
8876 It->second.second ? Instruction::SExt : Instruction::ZExt;
8883 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8884 ScalarCost,
"Calculated costs for Tree"));
8885 return VecCost - ScalarCost;
8890 assert((E->State == TreeEntry::Vectorize ||
8891 E->State == TreeEntry::StridedVectorize) &&
8892 "Entry state expected to be Vectorize or StridedVectorize here.");
8896 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
8897 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
8898 "Calculated GEPs cost for Tree"));
8900 return VecCost - ScalarCost;
8903 switch (ShuffleOrOp) {
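  // Every case below follows the same shape: GetScalarCost(Idx) prices one of
  // the original scalar instructions, GetVectorCost(CommonCost) prices the
  // vectorized replacement, and GetCostDiff combines them as
  // (vector cost + common shuffle cost) - (sum of scalar costs), so a negative
  // result means vectorizing this node is expected to be profitable.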
8904 case Instruction::PHI: {
8908 for (
Value *V : UniqueValues) {
8909 auto *
PHI = dyn_cast<PHINode>(V);
8914 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
8918 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
8920 if (!OpTE->ReuseShuffleIndices.empty())
8921 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8922 OpTE->Scalars.size());
8925 return CommonCost - ScalarCost;
8927 case Instruction::ExtractValue:
8928 case Instruction::ExtractElement: {
8929 auto GetScalarCost = [&](
unsigned Idx) {
8930 auto *
I = cast<Instruction>(UniqueValues[
Idx]);
8932 if (ShuffleOrOp == Instruction::ExtractElement) {
8933 auto *EE = cast<ExtractElementInst>(
I);
8934 SrcVecTy = EE->getVectorOperandType();
8936 auto *EV = cast<ExtractValueInst>(
I);
8937 Type *AggregateTy = EV->getAggregateOperand()->getType();
8939 if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
8940 NumElts = ATy->getNumElements();
8945 if (
I->hasOneUse()) {
8947 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
8948 all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
8955 Ext->getOpcode(),
Ext->getType(),
I->getType(),
8963 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
8964 return GetCostDiff(GetScalarCost, GetVectorCost);
8966 case Instruction::InsertElement: {
8967 assert(E->ReuseShuffleIndices.empty() &&
8968 "Unique insertelements only are expected.");
8969 auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
8970 unsigned const NumElts = SrcVecTy->getNumElements();
8971 unsigned const NumScalars = VL.
size();
8977 unsigned OffsetEnd = OffsetBeg;
8978 InsertMask[OffsetBeg] = 0;
8981 if (OffsetBeg >
Idx)
8983 else if (OffsetEnd <
Idx)
8985 InsertMask[
Idx] =
I + 1;
8989 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
8990 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
8992 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
8993 unsigned InsertVecSz = std::min<unsigned>(
8995 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
8996 bool IsWholeSubvector =
8997 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9001 if (OffsetBeg + InsertVecSz > VecSz) {
9004 InsertVecSz = VecSz;
9010 if (!E->ReorderIndices.empty()) {
9015 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
9017 bool IsIdentity =
true;
9019 Mask.swap(PrevMask);
9020 for (
unsigned I = 0;
I < NumScalars; ++
I) {
9022 DemandedElts.
setBit(InsertIdx);
9023 IsIdentity &= InsertIdx - OffsetBeg ==
I;
9024 Mask[InsertIdx - OffsetBeg] =
I;
9026 assert(
Offset < NumElts &&
"Failed to find vector index offset");
9041 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
9042 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9050 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9051 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
9052 if (InsertVecSz != VecSz) {
9064 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
      unsigned Opcode = ShuffleOrOp;
      unsigned VecOpcode = Opcode;
          (SrcIt != MinBWs.end() || It != MinBWs.end())) {
        unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        if (BWSz == SrcBWSz) {
          VecOpcode = Instruction::BitCast;
        } else if (BWSz < SrcBWSz) {
          VecOpcode = Instruction::Trunc;
        } else if (It != MinBWs.end()) {
          assert(BWSz > SrcBWSz && "Invalid cast!");
          VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
        } else if (SrcIt != MinBWs.end()) {
          assert(BWSz > SrcBWSz && "Invalid cast!");
              SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
                 !SrcIt->second.second) {
        VecOpcode = Instruction::UIToFP;
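      // When MinBWs says either the source or this node was demoted to a
      // narrower integer type, the cast opcode is re-derived for the demoted
      // widths: equal widths degenerate to a bitcast, a narrower destination
      // becomes a trunc, and a wider one becomes sext/zext depending on the
      // recorded signedness. Illustrative example: a zext i8 -> i32 feeding a
      // chain demoted to i16 is costed as zext i8 -> i16 instead.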
9117 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9125 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9127 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
9131 VecOpcode == Opcode ? VI :
nullptr);
9133 return GetCostDiff(GetScalarCost, GetVectorCost);
9135 case Instruction::FCmp:
9136 case Instruction::ICmp:
9137 case Instruction::Select: {
9141 match(VL0, MatchCmp))
9147 auto GetScalarCost = [&](
unsigned Idx) {
9148 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9154 !
match(VI, MatchCmp)) ||
9155 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9161 Builder.getInt1Ty(), CurrentPred,
CostKind,
9168 E->getOpcode(), VecTy, MaskTy, VecPred,
CostKind, VL0);
9180 if (IntrinsicAndUse.second)
9183 VecCost = std::min(VecCost, IntrinsicCost);
9185 return VecCost + CommonCost;
9187 return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
9208 auto GetScalarCost = [&](
unsigned Idx) {
9209 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9210 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9219 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9223 Op2Info, std::nullopt,
nullptr, TLI) +
9226 return GetCostDiff(GetScalarCost, GetVectorCost);
9228 case Instruction::GetElementPtr: {
9229 return CommonCost + GetGEPCostDiff(VL, VL0);
9231 case Instruction::Load: {
9232 auto GetScalarCost = [&](
unsigned Idx) {
9233 auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
9235 VI->getAlign(),
VI->getPointerAddressSpace(),
9238 auto *LI0 = cast<LoadInst>(VL0);
9241 if (E->State == TreeEntry::Vectorize) {
9243 Instruction::Load, VecTy, LI0->getAlign(),
9245 }
else if (E->State == TreeEntry::StridedVectorize) {
9246 Align CommonAlignment =
9247 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9249 Instruction::Load, VecTy, LI0->getPointerOperand(),
9252 assert(E->State == TreeEntry::ScatterVectorize &&
"Unknown EntryState");
9253 Align CommonAlignment =
9254 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9256 Instruction::Load, VecTy, LI0->getPointerOperand(),
9259 return VecLdCost + CommonCost;
9265 if (E->State == TreeEntry::ScatterVectorize)
9271 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
9272 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9274 case Instruction::Store: {
9275 bool IsReorder = !E->ReorderIndices.empty();
9276 auto GetScalarCost = [=](
unsigned Idx) {
9277 auto *
VI = cast<StoreInst>(VL[
Idx]);
9280 VI->getAlign(),
VI->getPointerAddressSpace(),
9284 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9289 BaseSI->getPointerAddressSpace(),
CostKind,
9295 unsigned Idx = IsReorder ? E->ReorderIndices[
I] :
I;
9296 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
9299 return GetCostDiff(GetScalarCost, GetVectorCost) +
9300 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9302 case Instruction::Call: {
9303 auto GetScalarCost = [&](
unsigned Idx) {
9304 auto *CI = cast<CallInst>(UniqueValues[
Idx]);
9315 auto *CI = cast<CallInst>(VL0);
9319 It != MinBWs.
end() ? It->second.first : 0);
9321 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9323 return GetCostDiff(GetScalarCost, GetVectorCost);
9325 case Instruction::ShuffleVector: {
9326 assert(E->isAltShuffle() &&
9331 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9332 "Invalid Shuffle Vector Operand");
9335 auto TryFindNodeWithEqualOperands = [=]() {
9336 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9339 if (
TE->isAltShuffle() &&
9340 ((
TE->getOpcode() == E->getOpcode() &&
9341 TE->getAltOpcode() == E->getAltOpcode()) ||
9342 (
TE->getOpcode() == E->getAltOpcode() &&
9343 TE->getAltOpcode() == E->getOpcode())) &&
9344 TE->hasEqualOperands(*E))
9349 auto GetScalarCost = [&](
unsigned Idx) {
9350 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9351 assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
9361 if (TryFindNodeWithEqualOperands()) {
9363 dbgs() <<
"SLP: diamond match for alternate node found.\n";
9370 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
9372 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
9373 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9375 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9376 CI0->getPredicate(),
CostKind, VL0);
9377 VecCost += TTIRef.getCmpSelInstrCost(
9378 E->getOpcode(), VecTy, MaskTy,
9379 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
9382 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9385 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9386 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9388 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9389 if (SrcIt != MinBWs.
end()) {
9390 SrcBWSz = SrcIt->second.first;
9394 if (BWSz <= SrcBWSz) {
9397 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9401 <<
"SLP: alternate extension, which should be truncated.\n";
9407 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9410 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9414 E->buildAltOpShuffleMask(
9416 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
9417 return I->getOpcode() == E->getAltOpcode();
9426 unsigned Opcode0 = E->getOpcode();
9427 unsigned Opcode1 = E->getAltOpcode();
9430 for (
unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9431 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9432 OpcodeMask.set(Lane);
9435 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9437 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
9438 return AltVecCost < VecCost ? AltVecCost : VecCost;
9443 return GetCostDiff(GetScalarCost, GetVectorCost);
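// Trees with only one or two nodes are usually not worth vectorizing;
// isFullyVectorizableTinyTree below whitelists the exceptions, e.g. a single
// vectorizable node of sufficient width, or a vectorizable root whose only
// other node is a cheap gather built from extractelements or loads.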
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                    << VectorizableTree.size() << " is fully vectorizable .\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->State == TreeEntry::NeedToGather &&
               [this](Value *V) { return EphValues.contains(V); }) &&
             TE->Scalars.size() < Limit ||
            ((TE->getOpcode() == Instruction::ExtractElement ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
            (TE->State == TreeEntry::NeedToGather &&
             TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9469 if (VectorizableTree.size() == 1 &&
9470 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9472 AreVectorizableGathers(VectorizableTree[0].
get(),
9473 VectorizableTree[0]->Scalars.size()) &&
9474 VectorizableTree[0]->getVectorFactor() > 2)))
9477 if (VectorizableTree.size() != 2)
9485 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9486 AreVectorizableGathers(VectorizableTree[1].
get(),
9487 VectorizableTree[0]->Scalars.size()))
9491 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9492 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9493 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9494 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
                                   bool MustMatchOrInst) {
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
              ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
  Type *SrcTy = Load->getType();
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
9544 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9545 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9553 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9554 for (
Value *Scalar : VectorizableTree[0]->Scalars) {
9565 if (VectorizableTree.size() == 2 &&
9566 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9567 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9568 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9569 !(
isSplat(VectorizableTree[1]->Scalars) ||
9577 constexpr int Limit = 4;
9579 !VectorizableTree.empty() &&
9580 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9581 return (TE->State == TreeEntry::NeedToGather &&
9582 TE->getOpcode() != Instruction::ExtractElement &&
9583 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9584 TE->getOpcode() == Instruction::PHI;
9595 if (isFullyVectorizableTinyTree(ForReduction))
9600 bool IsAllowedSingleBVNode =
9601 VectorizableTree.size() > 1 ||
9602 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9603 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9604 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9606 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9607 return TE->State == TreeEntry::NeedToGather &&
9609 return isa<ExtractElementInst, UndefValue>(V) ||
9610 (IsAllowedSingleBVNode &&
9611 !V->hasNUsesOrMore(UsesLimit) &&
9612 any_of(V->users(), IsaPred<InsertElementInst>));
9617 assert(VectorizableTree.empty()
9618 ? ExternalUses.empty()
9619 :
true &&
"We shouldn't have any external users");
9631 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9644 for (
const auto &TEPtr : VectorizableTree) {
9645 if (TEPtr->State != TreeEntry::Vectorize)
9647 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9653 auto *NodeA = DT->
getNode(
A->getParent());
9654 auto *NodeB = DT->
getNode(
B->getParent());
9655 assert(NodeA &&
"Should only process reachable instructions");
9656 assert(NodeB &&
"Should only process reachable instructions");
9657 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9658 "Different nodes should have different DFS numbers");
9660 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9661 return B->comesBefore(
A);
9671 LiveValues.
erase(PrevInst);
9672 for (
auto &J : PrevInst->
operands()) {
9673 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9674 LiveValues.
insert(cast<Instruction>(&*J));
9678 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
9679 for (
auto *
X : LiveValues)
9680 dbgs() <<
" " <<
X->getName();
9681 dbgs() <<
", Looking at ";
9686 unsigned NumCalls = 0;
9690 while (InstIt != PrevInstIt) {
9692 PrevInstIt = Inst->getParent()->rbegin();
9697 if (
auto *II = dyn_cast<IntrinsicInst>(
I)) {
9698 if (II->isAssumeLikeIntrinsic())
9702 for (
auto &ArgOp : II->args())
9704 if (
auto *FPMO = dyn_cast<FPMathOperator>(II))
9705 FMF = FPMO->getFastMathFlags();
9712 if (IntrCost < CallCost)
9719 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9720 &*PrevInstIt != PrevInst)
9728 for (
auto *II : LiveValues) {
9729 auto *ScalarTy = II->getType();
9730 if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9731 ScalarTy = VectorTy->getElementType();
9749 const auto *I1 = IE1;
9750 const auto *I2 = IE2;
9762 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9764 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9765 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
9767 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9768 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
template <typename T>
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9807 auto VMIt = std::next(ShuffleMask.begin());
9810 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9812 if (!IsBaseUndef.
all()) {
9814 std::pair<T *, bool> Res =
9815 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
9817 for (
unsigned Idx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
9821 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
9823 auto *V = ValueSelect::get<T *>(
Base);
9825 assert((!V || GetVF(V) == Mask.size()) &&
9826 "Expected base vector of VF number of elements.");
9827 Prev = Action(Mask, {
nullptr, Res.first});
9828 }
else if (ShuffleMask.size() == 1) {
9831 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9837 Prev = Action(Mask, {ShuffleMask.begin()->first});
9841 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9842 unsigned Vec2VF = GetVF(VMIt->first);
9843 if (Vec1VF == Vec2VF) {
9847 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9850 Mask[
I] = SecMask[
I] + Vec1VF;
9853 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9856 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9858 std::pair<T *, bool> Res2 =
9859 ResizeAction(VMIt->first, VMIt->second,
false);
9861 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9868 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
9871 Prev = Action(Mask, {Res1.first, Res2.first});
9873 VMIt = std::next(VMIt);
9875 bool IsBaseNotUndef = !IsBaseUndef.
all();
9876 (void)IsBaseNotUndef;
9878 for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9880 std::pair<T *, bool> Res =
9881 ResizeAction(VMIt->first, VMIt->second,
false);
9883 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9886 "Multiple uses of scalars.");
9887 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
9892 Prev = Action(Mask, {Prev, Res.first});
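    // Any remaining sources are folded in one at a time: each iteration
    // resizes the next vector, merges its mask into the accumulated one
    // (asserting that no lane is claimed twice), and shuffles it with the
    // result built so far.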
9900 << VectorizableTree.size() <<
".\n");
9902 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9905 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
9906 TreeEntry &TE = *VectorizableTree[
I];
9907 if (TE.State == TreeEntry::NeedToGather) {
9908 if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
9909 E && E->getVectorFactor() == TE.getVectorFactor() &&
9910 E->isSame(TE.Scalars)) {
9915 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
9924 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
9934 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
9935 for (ExternalUser &EU : ExternalUses) {
9937 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
9938 !ExtractCostCalculated.
insert(EU.Scalar).second)
9944 if (EphValues.
count(EU.User))
9948 if (isa<FixedVectorType>(EU.Scalar->getType()))
9953 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
9954 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
9955 if (!UsedInserts.
insert(VU).second)
9959 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
9962 [
this, VU](
const std::pair<Value *, const TreeEntry *> &Pair) {
9964 VU, cast<InsertElementInst>(Pair.first),
9966 Value *Op0 = II->getOperand(0);
9967 if (getTreeEntry(II) && !getTreeEntry(Op0))
9973 if (It == FirstUsers.
end()) {
9980 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
9981 if (IEBase != EU.User &&
9982 (!IEBase->hasOneUse() ||
9986 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
9989 IEBase = cast<InsertElementInst>(
Base);
9992 "InsertElementInstruction used already.");
9994 Base = IEBase->getOperand(0);
9995 }
while (E == getTreeEntry(
Base));
9998 Base = cast<InsertElementInst>(
Base)->getOperand(0);
10002 VecId = FirstUsers.
size() - 1;
10003 auto It = MinBWs.
find(ScalarTE);
10004 if (It != MinBWs.
end() &&
10006 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
10008 unsigned BWSz = It->second.first;
10009 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
10010 unsigned VecOpcode;
10011 if (DstBWSz < BWSz)
10012 VecOpcode = Instruction::Trunc;
10015 It->second.second ? Instruction::SExt : Instruction::ZExt;
10021 FTy->getNumElements()),
10024 <<
" for extending externally used vector with "
10025 "non-equal minimum bitwidth.\n");
10031 VecId = std::distance(FirstUsers.
begin(), It);
10033 int InIdx = *InsertIdx;
10037 Mask[InIdx] = EU.Lane;
10038 DemandedElts[VecId].setBit(InIdx);
10046 if (
auto *
GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10047 if (!ValueToExtUses) {
10048 ValueToExtUses.emplace();
10050 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
10056 if (!getTreeEntry(V))
10058 auto It = ValueToExtUses->find(V);
10059 if (It != ValueToExtUses->end()) {
10061 ExternalUses[It->second].User = nullptr;
10066 if (CanBeUsedAsGEP) {
10068 ExternalUsesAsGEPs.
insert(EU.Scalar);
10077 auto It = MinBWs.
find(getTreeEntry(EU.Scalar));
10078 if (It != MinBWs.
end()) {
10081 It->second.second ? Instruction::SExt : Instruction::ZExt;
10091 if (!VectorizedVals.
empty()) {
10092 const TreeEntry &Root = *VectorizableTree.front().get();
10093 auto BWIt = MinBWs.find(&Root);
10094 if (BWIt != MinBWs.end()) {
10095 Type *DstTy = Root.Scalars.front()->getType();
10096 unsigned OriginalSz =
DL->getTypeSizeInBits(DstTy);
10098 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10099 if (OriginalSz != SrcSz) {
10100 unsigned Opcode = Instruction::Trunc;
10101 if (OriginalSz > SrcSz)
10102 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10112 Cost += SpillCost + ExtractCost;
10116 unsigned VF =
Mask.size();
10117 unsigned VecVF =
TE->getVectorFactor();
10119 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
10122 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
10128 dbgs() <<
"SLP: Adding cost " <<
C
10129 <<
" for final shuffle of insertelement external users.\n";
10130 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10132 return std::make_pair(TE,
true);
10134 return std::make_pair(TE,
false);
10137 for (
int I = 0, E = FirstUsers.size();
I < E; ++
I) {
10138 Value *
Base = cast<Instruction>(FirstUsers[
I].first)->getOperand(0);
10139 auto Vector = ShuffleMasks[
I].takeVector();
10143 assert((TEs.size() == 1 || TEs.size() == 2) &&
10144 "Expected exactly 1 or 2 tree entries.");
10145 if (TEs.size() == 1) {
10147 VF = TEs.front()->getVectorFactor();
10153 (
Data.index() < VF &&
10154 static_cast<int>(
Data.index()) ==
Data.value());
10159 <<
" for final shuffle of insertelement "
10160 "external users.\n";
10161 TEs.front()->
dump();
10162 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10168 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10169 VF = TEs.front()->getVectorFactor();
10178 <<
" for final shuffle of vector node and external "
10179 "insertelement users.\n";
10180 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10181 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10187 (void)performExtractsShuffleAction<const TreeEntry>(
10189 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
10190 EstimateShufflesCost);
10192 cast<FixedVectorType>(FirstUsers[
I].first->getType()), DemandedElts[
I],
10194 Cost -= InsertCost;
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front().get();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize)
        Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      switch (E.getOpcode()) {
      case Instruction::SExt:
      case Instruction::ZExt:
      case Instruction::Trunc: {
        const TreeEntry *OpTE = getOperandEntry(&E, 0);
        CCH = getCastContextHint(*OpTE);
10229 <<
" for final resize for reduction from " << SrcVecTy
10230 <<
" to " << DstVecTy <<
"\n";
10231 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
    ViewGraph(this, "SLP" + F->getName(), false, Str);
10256std::optional<TTI::ShuffleKind>
10257BoUpSLP::tryToGatherSingleRegisterExtractElements(
10263 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
10264 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10266 if (isa<UndefValue>(VL[
I]))
10270 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10271 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10280 ExtractMask.reset(*
Idx);
10285 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
10289 for (
const auto &
Data : VectorOpToIdx)
10290 VFToVector[cast<FixedVectorType>(
Data.first->getType())->getNumElements()]
10291 .push_back(
Data.first);
10292 for (
auto &
Data : VFToVector) {
10294 return VectorOpToIdx.find(V1)->second.size() >
10295 VectorOpToIdx.find(V2)->second.size();
10300 const int UndefSz = UndefVectorExtracts.
size();
10301 unsigned SingleMax = 0;
10302 Value *SingleVec =
nullptr;
10303 unsigned PairMax = 0;
10304 std::pair<Value *, Value *> PairVec(
nullptr,
nullptr);
10305 for (
auto &
Data : VFToVector) {
10307 if (SingleMax < VectorOpToIdx[V1].
size() + UndefSz) {
10308 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10312 if (
Data.second.size() > 1)
10313 V2 = *std::next(
Data.second.begin());
10314 if (V2 && PairMax < VectorOpToIdx[V1].
size() + VectorOpToIdx[V2].
size() +
10316 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[
V2].size() + UndefSz;
10317 PairVec = std::make_pair(V1, V2);
10320 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10321 return std::nullopt;
10327 if (SingleMax >= PairMax && SingleMax) {
10328 for (
int Idx : VectorOpToIdx[SingleVec])
10331 for (
Value *V : {PairVec.first, PairVec.second})
10332 for (
int Idx : VectorOpToIdx[V])
10336 for (
int Idx : UndefVectorExtracts)
10340 std::optional<TTI::ShuffleKind> Res =
10346 return std::nullopt;
10350 for (
int I = 0, E = GatheredExtracts.size();
I < E; ++
I) {
10351 if (Mask[
I] ==
PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[
I]) &&
10352 isa<UndefValue>(GatheredExtracts[
I])) {
10356 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10357 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10358 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10373 unsigned NumParts)
const {
10374 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
10377 unsigned SliceSize = VL.
size() / NumParts;
10378 for (
unsigned Part = 0; Part < NumParts; ++Part) {
10384 std::optional<TTI::ShuffleKind> Res =
10385 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10386 ShufflesRes[Part] = Res;
10387 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
10389 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
10390 return Res.has_value();
10392 ShufflesRes.clear();
10393 return ShufflesRes;
10396std::optional<TargetTransformInfo::ShuffleKind>
10397BoUpSLP::isGatherShuffledSingleRegisterEntry(
10403 const EdgeInfo &TEUseEI =
TE->UserTreeIndices.front();
10404 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10408 if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10409 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10412 TEInsertBlock = TEInsertPt->
getParent();
10415 return std::nullopt;
10416 auto *NodeUI = DT->
getNode(TEInsertBlock);
10417 assert(NodeUI &&
"Should only process reachable instructions");
10419 auto CheckOrdering = [&](
const Instruction *InsertPt) {
10433 auto *NodeEUI = DT->
getNode(InsertBlock);
10436 assert((NodeUI == NodeEUI) ==
10437 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10438 "Different nodes should have different DFS numbers");
10440 if (TEInsertPt->
getParent() != InsertBlock &&
10443 if (TEInsertPt->
getParent() == InsertBlock &&
10457 for (
Value *V : VL) {
10462 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10466 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
10467 "Must contain at least single gathered value.");
10468 assert(TEPtr->UserTreeIndices.size() == 1 &&
10469 "Expected only single user of a gather node.");
10470 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10472 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10475 : &getLastInstructionInBundle(UseEI.UserTE);
10476 if (TEInsertPt == InsertPt) {
10480 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10484 if (TEUseEI.UserTE != UseEI.UserTE &&
10485 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10491 if ((TEInsertBlock != InsertPt->
getParent() ||
10492 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10493 !CheckOrdering(InsertPt))
10497 if (
const TreeEntry *VTE = getTreeEntry(V)) {
10499 if (VTE->State != TreeEntry::Vectorize) {
10500 auto It = MultiNodeScalars.
find(V);
10501 if (It == MultiNodeScalars.
end())
10503 VTE = *It->getSecond().begin();
10505 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
10506 return MTE->State == TreeEntry::Vectorize;
10508 if (MIt == It->getSecond().end())
10513 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10514 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10518 if (VToTEs.
empty())
10520 if (UsedTEs.
empty()) {
10534 if (!VToTEs.
empty()) {
10540 VToTEs = SavedVToTEs;
10549 if (UsedTEs.
size() == 2)
10551 UsedTEs.push_back(SavedVToTEs);
10558 if (UsedTEs.
empty()) {
10560 return std::nullopt;
10564 if (UsedTEs.
size() == 1) {
10567 UsedTEs.front().
end());
10568 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10569 return TE1->Idx < TE2->Idx;
10572 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
10573 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
10575 if (It != FirstEntries.end() &&
10576 ((*It)->getVectorFactor() == VL.size() ||
10577 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
10578 TE->ReuseShuffleIndices.size() == VL.size() &&
10579 (*It)->isSame(
TE->Scalars)))) {
10580 Entries.push_back(*It);
10581 if ((*It)->getVectorFactor() == VL.size()) {
10582 std::iota(std::next(
Mask.begin(), Part * VL.size()),
10583 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
10589 for (
int I = 0, Sz = VL.size();
I < Sz; ++
I)
10590 if (isa<PoisonValue>(VL[
I]))
10596 Entries.push_back(FirstEntries.front());
10599 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
10602 for (
const TreeEntry *TE : UsedTEs.front()) {
10603 unsigned VF =
TE->getVectorFactor();
10604 auto It = VFToTE.
find(VF);
10605 if (It != VFToTE.
end()) {
10606 if (It->second->Idx >
TE->Idx)
10607 It->getSecond() =
TE;
10614 UsedTEs.back().
end());
10615 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10616 return TE1->Idx < TE2->Idx;
10618 for (
const TreeEntry *TE : SecondEntries) {
10619 auto It = VFToTE.
find(
TE->getVectorFactor());
10620 if (It != VFToTE.
end()) {
10622 Entries.push_back(It->second);
10623 Entries.push_back(TE);
10629 if (Entries.empty()) {
10631 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10632 return TE1->Idx < TE2->Idx;
10634 Entries.push_back(SecondEntries.front());
10635 VF = std::max(Entries.front()->getVectorFactor(),
10636 Entries.back()->getVectorFactor());
10640 bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
10643 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
10644 auto *
PHI = cast<PHINode>(V);
10645 auto *PHI1 = cast<PHINode>(V1);
10650 for (
int I = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
10652 Value *In1 = PHI1->getIncomingValue(
I);
10657 if (cast<Instruction>(In)->
getParent() !=
10667 auto MightBeIgnored = [=](
Value *
V) {
10668 auto *
I = dyn_cast<Instruction>(V);
10669 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
10671 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
10676 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
10678 bool UsedInSameVTE =
false;
10679 auto It = UsedValuesEntry.
find(V1);
10680 if (It != UsedValuesEntry.
end())
10681 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
10682 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10684 cast<Instruction>(V)->getParent() ==
10685 cast<Instruction>(V1)->getParent() &&
10686 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10691 for (
int I = 0, E = VL.size();
I < E; ++
I) {
10693 auto It = UsedValuesEntry.
find(V);
10694 if (It == UsedValuesEntry.
end())
10700 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
10701 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
10703 unsigned Idx = It->second;
10710 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
10711 if (!UsedIdxs.test(
I))
10717 for (std::pair<unsigned, int> &Pair : EntryLanes)
10718 if (Pair.first ==
I)
10719 Pair.first = TempEntries.
size();
10722 Entries.swap(TempEntries);
10723 if (EntryLanes.size() == Entries.size() &&
10725 .
slice(Part * VL.size(),
10726 std::min<int>(VL.size(),
TE->Scalars.size())))) {
10732 return std::nullopt;
10735 bool IsIdentity = Entries.size() == 1;
10738 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
10739 unsigned Idx = Part * VL.size() + Pair.second;
10742 (ForOrder ? std::distance(
10743 Entries[Pair.first]->Scalars.begin(),
10744 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10745 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10746 IsIdentity &=
Mask[
Idx] == Pair.second;
10748 switch (Entries.size()) {
10750 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10754 if (EntryLanes.size() > 2 || VL.size() <= 2)
10762 std::fill(std::next(
Mask.begin(), Part * VL.size()),
10764 return std::nullopt;
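// A gather that maps onto a single register either matches one existing tree
// entry outright, reuses a single entry through a permute, or blends exactly
// two entries; anything that would need more sources returns std::nullopt and
// the scalars stay as a plain gather.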
10768BoUpSLP::isGatherShuffledEntry(
10772 assert(NumParts > 0 && NumParts < VL.
size() &&
10773 "Expected positive number of registers.");
10776 if (TE == VectorizableTree.front().get())
10779 if (
TE->isNonPowOf2Vec())
10782 assert(
TE->UserTreeIndices.size() == 1 &&
10783 "Expected only single user of the gather node.");
10785 "Number of scalars must be divisible by NumParts.");
10786 unsigned SliceSize = VL.
size() / NumParts;
10788 for (
unsigned Part = 0; Part < NumParts; ++Part) {
10791 std::optional<TTI::ShuffleKind> SubRes =
10792 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
10795 SubEntries.
clear();
10798 SubEntries.
front()->getVectorFactor() == VL.
size() &&
10799 (SubEntries.
front()->isSame(
TE->Scalars) ||
10800 SubEntries.
front()->isSame(VL))) {
10802 LocalSubEntries.
swap(SubEntries);
10805 std::iota(
Mask.begin(),
Mask.end(), 0);
10807 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
10808 if (isa<PoisonValue>(VL[
I]))
10810 Entries.emplace_back(1, LocalSubEntries.
front());
10816 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
10824 bool ForPoisonSrc)
const {
10826 Type *ScalarTy = VL[0]->getType();
10827 if (
StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
10828 ScalarTy =
SI->getValueOperand()->getType();
10830 bool DuplicateNonConst =
false;
10838 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
10845 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
10848 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V)) {
10856 EstimateInsertCost(
I, V);
10857 ShuffleMask[
I] =
I;
10861 DuplicateNonConst =
true;
10863 ShuffleMask[
I] = Res.first->second;
10869 if (DuplicateNonConst)
10871 VecTy, ShuffleMask);
10883 VLOperands Ops(VL, R);
10886 Left = Ops.getVL(0);
10887 Right = Ops.getVL(1);
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
    return *Res.second;
  auto *Front = E->getMainOp();
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(V))
    auto *I = cast<Instruction>(V);
    return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
           isVectorLikeInstWithConstOps(I);
  auto FindLastInst = [&]() {
10910 for (
Value *V : E->Scalars) {
10911 auto *
I = dyn_cast<Instruction>(V);
10914 if (LastInst->
getParent() ==
I->getParent()) {
10919 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10920 !isa<GetElementPtrInst>(
I)) ||
10923 "Expected vector-like or non-GEP in GEP node insts only.");
10931 auto *NodeB = DT->
getNode(
I->getParent());
10932 assert(NodeA &&
"Should only process reachable instructions");
10933 assert(NodeB &&
"Should only process reachable instructions");
10934 assert((NodeA == NodeB) ==
10935 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10936 "Different nodes should have different DFS numbers");
10937 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10944 auto FindFirstInst = [&]() {
10946 for (
Value *V : E->Scalars) {
10947 auto *
I = dyn_cast<Instruction>(V);
10950 if (FirstInst->
getParent() ==
I->getParent()) {
10951 if (
I->comesBefore(FirstInst))
10955 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10956 !isa<GetElementPtrInst>(
I)) ||
10959 "Expected vector-like or non-GEP in GEP node insts only.");
10967 auto *NodeB = DT->
getNode(
I->getParent());
10968 assert(NodeA &&
"Should only process reachable instructions");
10969 assert(NodeB &&
"Should only process reachable instructions");
10970 assert((NodeA == NodeB) ==
10971 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10972 "Different nodes should have different DFS numbers");
10973 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
10982 (E->State != TreeEntry::NeedToGather &&
10984 if ((E->getOpcode() == Instruction::GetElementPtr &&
10987 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
10991 return !isVectorLikeInstWithConstOps(V) &&
10992 isUsedOutsideBlock(V);
10994 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
10995 all_of(E->Scalars, IsaPred<ExtractElementInst, UndefValue>)))
10996 Res.second = FindLastInst();
10998 Res.second = FindFirstInst();
10999 return *Res.second;
11006 if (BlocksSchedules.count(BB)) {
11007 Value *
V = E->isOneOf(E->Scalars.back());
11010 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11011 if (Bundle && Bundle->isPartOfBundle())
11012 for (; Bundle; Bundle = Bundle->NextInBundle)
11013 if (Bundle->OpValue == Bundle->Inst)
11014 Res.second = Bundle->Inst;
11036 Res.second = FindLastInst();
11037 assert(Res.second &&
"Failed to find last instruction in bundle");
11038 return *Res.second;
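// Editor's note: hypothetical standalone sketch of the "find the last
// instruction of a bundle" rule above: within one block the later program
// position wins; across blocks the real code falls back to dominator-tree
// DFS-in numbers, which is not modelled here.
#include <cstdio>
#include <string>
#include <vector>

struct Inst {
  std::string Name;
  int Block;    // basic block the instruction lives in
  int Position; // program order within that block (comesBefore analogue)
};

static const Inst *findLastInst(const std::vector<Inst> &Bundle) {
  const Inst *Last = &Bundle.front();
  for (const Inst &I : Bundle)
    if (I.Block == Last->Block && Last->Position < I.Position)
      Last = &I; // same block: later position wins
  return Last;
}

int main() {
  std::vector<Inst> Bundle = {{"a", 0, 3}, {"b", 0, 7}, {"c", 0, 5}};
  std::printf("last in bundle: %s\n", findLastInst(Bundle)->Name.c_str());
}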
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, insert after all PHIs of the block.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (E->State != TreeEntry::NeedToGather &&
                doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Otherwise insert right after the last instruction of the bundle.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
11070 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
11073 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
11074 InsertBB = InsertBB->getSinglePredecessor();
11075 return InsertBB && InsertBB == InstBB;
11077 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11078 if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
11079 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11080 getTreeEntry(Inst) ||
11081 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
11082 PostponedIndices.
insert(
I).second)
11086 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
11089 if (cast<VectorType>(Vec->
getType())->getElementType() != Ty) {
11091 "Expected integer types only.");
11092 Vec = Builder.CreateIntCast(
11095 cast<VectorType>(Vec->
getType())->getElementCount()),
11099 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11100 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11103 GatherShuffleExtractSeq.
insert(InsElt);
11104 CSEBlocks.
insert(InsElt->getParent());
11106 if (isa<Instruction>(V)) {
11107 if (TreeEntry *Entry = getTreeEntry(V)) {
11109 User *UserOp =
nullptr;
11111 if (
auto *SI = dyn_cast<Instruction>(Scalar))
11117 unsigned FoundLane = Entry->findLaneForValue(V);
11118 ExternalUses.emplace_back(V, UserOp, FoundLane);
11125 isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
11131 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11139 if (!isa<UndefValue>(VL[
I])) {
11143 if (isa<PoisonValue>(VL[
I]))
11145 if (
auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11150 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
11153 for (
int I : NonConsts)
11154 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
11157 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11158 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
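// Editor's note: hypothetical standalone sketch of the gather-emission order
// above: lanes whose value is already available are inserted immediately,
// the remaining lanes are remembered as (value, lane) pairs and inserted
// afterwards, mirroring the PostponedInsts handling. Plain C++, not LLVM IR.
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // -1 marks a lane whose value is not available yet (must be postponed).
  std::vector<int> VL = {1, -1, 3, -1};
  std::vector<int> Vec(VL.size(), 0);              // the vector being built
  std::vector<std::pair<int, unsigned>> Postponed; // (value, lane)
  for (unsigned I = 0; I < VL.size(); ++I) {
    if (VL[I] >= 0)
      Vec[I] = VL[I];                       // immediate insertelement
    else
      Postponed.emplace_back(100 + I, I);   // value becomes known later
  }
  for (auto &P : Postponed)
    Vec[P.second] = P.first;                // late insertelement
  for (int V : Vec)
    std::printf("%d ", V);
  std::printf("\n");
}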
11196 bool IsFinalized =
false;
11209 class ShuffleIRBuilder {
11222 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11223 CSEBlocks(CSEBlocks),
DL(
DL) {}
11224 ~ShuffleIRBuilder() =
default;
11227 if (V1->
getType() != V2->getType()) {
11230 "Expected integer vector types only.");
11231 if (V1->
getType() != V2->getType()) {
11232 if (cast<VectorType>(V2->getType())
11234 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
11236 ->getIntegerBitWidth())
11245 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
11246 GatherShuffleExtractSeq.
insert(
I);
11247 CSEBlocks.
insert(
I->getParent());
11256 unsigned VF = Mask.size();
11257 unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
11261 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
11262 GatherShuffleExtractSeq.
insert(
I);
11263 CSEBlocks.
insert(
I->getParent());
  Value *createIdentity(Value *V) { return V; }
  Value *createPoison(Type *Ty, unsigned VF) {
    return PoisonValue::get(FixedVectorType::get(Ty, VF));
  }
  /// Resize the two inputs to a common width: the smaller vector is widened
  /// to the larger one's factor with an identity shuffle whose tail lanes
  /// are poison.
  void resizeToMatch(Value *&V1, Value *&V2) {
    if (V1->getType() == V2->getType())
      return;
    int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
    int VF = std::max(V1VF, V2VF);
    int MinVF = std::min(V1VF, V2VF);
    SmallVector<int> IdentityMask(VF, PoisonMaskElem);
    std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
              0);
    Value *&Op = MinVF == V1VF ? V1 : V2;
    Op = Builder.CreateShuffleVector(Op, IdentityMask);
    if (auto *I = dyn_cast<Instruction>(Op)) {
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }
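// Editor's note: hypothetical standalone model of resizeToMatch above: pad
// the narrower of two vectors up to the wider factor with an identity
// shuffle whose tail lanes are poison (-1 here). Plain C++, not the LLVM
// implementation.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

static void resizeToMatch(std::vector<int> &V1, std::vector<int> &V2) {
  if (V1.size() == V2.size())
    return;
  size_t VF = std::max(V1.size(), V2.size());
  size_t MinVF = std::min(V1.size(), V2.size());
  std::vector<int> IdentityMask(VF, -1);
  std::iota(IdentityMask.begin(), IdentityMask.begin() + MinVF, 0);
  std::vector<int> &Op = (MinVF == V1.size()) ? V1 : V2;
  std::vector<int> Widened(VF, 0); // poison lanes modelled as 0 here
  for (size_t I = 0; I < VF; ++I)
    if (IdentityMask[I] >= 0)
      Widened[I] = Op[IdentityMask[I]];
  Op = Widened;
}

int main() {
  std::vector<int> A = {1, 2}, B = {3, 4, 5, 6};
  resizeToMatch(A, B);
  std::printf("A is now %zu lanes wide\n", A.size());
}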
11300 assert(V1 &&
"Expected at least one vector value.");
11301 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11302 R.CSEBlocks, *R.DL);
11303 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11311 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11318 : Builder(Builder), R(R) {}
11322 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11323 unsigned NumParts,
bool &UseVecBaseAsInput) {
11324 UseVecBaseAsInput =
false;
11326 Value *VecBase =
nullptr;
11327 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
11331 auto *EI = cast<ExtractElementInst>(E->Scalars[
I]);
11332 VecBase = EI->getVectorOperand();
11333 if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
11334 VecBase = TE->VectorizedValue;
11335 assert(VecBase &&
"Expected vectorized value.");
11336 UniqueBases.
insert(VecBase);
11339 if (!EI->hasOneUse() || (NumParts != 1 &&
count(E->Scalars, EI) > 1) ||
11341 const TreeEntry *UTE = R.getTreeEntry(U);
11342 return !UTE || R.MultiNodeScalars.contains(U) ||
11343 count_if(R.VectorizableTree,
11344 [&](const std::unique_ptr<TreeEntry> &TE) {
11345 return any_of(TE->UserTreeIndices,
11346 [&](const EdgeInfo &Edge) {
11347 return Edge.UserTE == UTE;
11349 is_contained(TE->Scalars, EI);
11353 R.eraseInstruction(EI);
11355 if (NumParts == 1 || UniqueBases.
size() == 1)
11357 UseVecBaseAsInput =
true;
11367 Value *Vec =
nullptr;
11369 unsigned SliceSize = E->Scalars.size() / NumParts;
11370 for (
unsigned Part = 0; Part < NumParts; ++Part) {
11374 constexpr int MaxBases = 2;
11382 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11383 if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
11384 VecOp = TE->VectorizedValue;
11385 assert(VecOp &&
"Expected vectorized value.");
11387 cast<FixedVectorType>(VecOp->
getType())->getNumElements();
11389 assert((PrevSize ==
Size || PrevSize == 0) &&
11390 "Expected vectors of the same size.");
11393 Bases[SubMask[
I] <
Size ? 0 : 1] = VecOp;
11395 if (!Bases.front())
11398 if (Bases.back()) {
11399 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11400 TransformToIdentity(SubMask);
11402 SubVec = Bases.front();
11409 Mask.slice(
P * SliceSize, SliceSize);
11414 "Expected first part or all previous parts masked.");
11415 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11417 unsigned VF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11419 unsigned SubVecVF =
11420 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
11421 VF = std::max(VF, SubVecVF);
11424 for (
int &
Idx : SubMask)
11427 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11428 Vec = createShuffle(Vec, SubVec, VecMask);
11429 TransformToIdentity(VecMask);
11437 std::optional<Value *>
11443 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
11445 return std::nullopt;
11449 E->getVectorFactor());
11457 add(E1.VectorizedValue, E2.VectorizedValue, Mask);
11462 add(E1.VectorizedValue, Mask);
11466 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
11467 if (InVectors.
empty()) {
11470 CommonMask.
assign(Mask.begin(), Mask.end());
11474 if (InVectors.
size() == 2) {
11475 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11476 transformMaskAfterShuffle(CommonMask, CommonMask);
11477 }
else if (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
11479 Vec = createShuffle(Vec,
nullptr, CommonMask);
11480 transformMaskAfterShuffle(CommonMask, CommonMask);
11482 V1 = createShuffle(V1, V2, Mask);
11483 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11485 CommonMask[
Idx] =
Idx + Sz;
11486 InVectors.
front() = Vec;
11487 if (InVectors.
size() == 2)
11488 InVectors.
back() = V1;
11494 if (InVectors.
empty()) {
11495 if (!isa<FixedVectorType>(V1->
getType())) {
11496 V1 = createShuffle(V1,
nullptr, CommonMask);
11498 transformMaskAfterShuffle(CommonMask, Mask);
11501 CommonMask.
assign(Mask.begin(), Mask.end());
11504 const auto *It =
find(InVectors, V1);
11505 if (It == InVectors.
end()) {
11506 if (InVectors.
size() == 2 ||
11508 !isa<FixedVectorType>(V1->
getType())) {
11510 if (InVectors.
size() == 2) {
11511 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11512 transformMaskAfterShuffle(CommonMask, CommonMask);
11513 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11514 CommonMask.
size()) {
11515 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
11516 transformMaskAfterShuffle(CommonMask, CommonMask);
11518 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11521 V->getType() != V1->
getType()
11523 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
11524 ->getNumElements();
11525 if (V->getType() != V1->
getType())
11526 V1 = createShuffle(V1,
nullptr, Mask);
11527 InVectors.
front() = V;
11528 if (InVectors.
size() == 2)
11529 InVectors.
back() = V1;
11536 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11542 int VF = CommonMask.
size();
11543 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
11544 VF = FTy->getNumElements();
11545 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11547 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
11556 Value *Root =
nullptr) {
11557 return R.gather(VL, Root);
11566 IsFinalized =
true;
11569 if (InVectors.
size() == 2) {
11570 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11573 Vec = createShuffle(Vec,
nullptr, CommonMask);
11575 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11579 "Expected vector length for the final value before action.");
11580 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11583 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11584 Vec = createShuffle(Vec,
nullptr, ResizeMask);
11586 Action(Vec, CommonMask);
11587 InVectors.
front() = Vec;
11589 if (!ExtMask.
empty()) {
11590 if (CommonMask.
empty()) {
11594 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11597 NewMask[
I] = CommonMask[ExtMask[
I]];
11599 CommonMask.
swap(NewMask);
11602 if (CommonMask.
empty()) {
11603 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11604 return InVectors.
front();
11606 if (InVectors.
size() == 2)
11607 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11608 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
11613 "Shuffle construction must be finalized.");
11617Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
11618 bool PostponedPHIs) {
11619 ValueList &VL = E->getOperand(NodeIdx);
11620 const unsigned VF = VL.size();
11623 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11624 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
11625 if (It != VL.end())
11628 if (S.getOpcode()) {
11629 auto CheckSameVE = [&](
const TreeEntry *VE) {
11630 return VE->isSame(VL) &&
11631 (
any_of(VE->UserTreeIndices,
11632 [E, NodeIdx](
const EdgeInfo &EI) {
11633 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11635 any_of(VectorizableTree,
11636 [E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
11637 return TE->isOperandGatherNode({E, NodeIdx}) &&
11638 VE->isSame(TE->Scalars);
11641 TreeEntry *VE = getTreeEntry(S.OpValue);
11642 bool IsSameVE = VE && CheckSameVE(VE);
11644 auto It = MultiNodeScalars.
find(S.OpValue);
11645 if (It != MultiNodeScalars.
end()) {
11646 auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
11647 return TE != VE && CheckSameVE(TE);
11649 if (
I != It->getSecond().end()) {
11657 ShuffleInstructionBuilder ShuffleBuilder(Builder, *
this);
11658 ShuffleBuilder.add(V, Mask);
11659 return ShuffleBuilder.finalize(std::nullopt);
11662 if (VF != cast<FixedVectorType>(
V->getType())->getNumElements()) {
11663 if (!VE->ReuseShuffleIndices.empty()) {
11684 if (isa<PoisonValue>(V))
11686 Mask[
I] = VE->findLaneForValue(V);
11688 V = FinalShuffle(V, Mask);
11690 assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
11691 "Expected vectorization factor less "
11692 "than original vector size.");
11694 std::iota(UniformMask.begin(), UniformMask.end(), 0);
11695 V = FinalShuffle(V, UniformMask);
11701 if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
11702 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11703 }) == VE->UserTreeIndices.end()) {
11705 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
11706 return TE->State == TreeEntry::NeedToGather &&
11707 TE->UserTreeIndices.front().UserTE == E &&
11708 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11710 assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
11711 (*It)->VectorizedValue =
V;
11720 auto *
I =
find_if(VectorizableTree,
11721 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
11722 return TE->isOperandGatherNode({E, NodeIdx});
11724 assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
11725 assert(
I->get()->UserTreeIndices.size() == 1 &&
11726 "Expected only single user for the gather node.");
11727 assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
11731template <
typename BVTy,
typename ResTy,
typename...
Args>
11732ResTy BoUpSLP::processBuildVector(
const TreeEntry *E, Args &...Params) {
11733 assert(E->State == TreeEntry::NeedToGather &&
"Expected gather node.");
11734 unsigned VF = E->getVectorFactor();
11736 bool NeedFreeze =
false;
11738 E->ReuseShuffleIndices.end());
11744 if (!ReorderMask.
empty())
11747 unsigned I,
unsigned SliceSize) {
11749 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11752 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
11753 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
11754 if (UserTE->getNumOperands() != 2)
11757 find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
11758 return find_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
11759 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11760 }) !=
TE->UserTreeIndices.end();
11762 if (It == VectorizableTree.end())
11765 if ((
Mask.size() < InputVF &&
11768 (
Mask.size() == InputVF &&
11770 std::iota(std::next(
Mask.begin(),
I * SliceSize),
11771 std::next(
Mask.begin(), (
I + 1) * SliceSize), 0);
11775 std::fill(std::next(
Mask.begin(),
I * SliceSize),
11776 std::next(
Mask.begin(), (
I + 1) * SliceSize), IVal);
11780 BVTy ShuffleBuilder(Params...);
11781 ResTy Res = ResTy();
11785 Value *ExtractVecBase =
nullptr;
11786 bool UseVecBaseAsInput =
false;
11789 Type *ScalarTy = GatheredScalars.front()->getType();
11792 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11794 if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
11796 bool Resized =
false;
11798 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
11799 if (!ExtractShuffles.
empty()) {
11804 if (
const auto *TE = getTreeEntry(
11805 cast<ExtractElementInst>(E->Scalars[
Idx])->getVectorOperand()))
11808 if (std::optional<ResTy> Delayed =
11809 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
11811 PostponedGathers.
insert(E);
11816 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
11817 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11818 ExtractVecBase = VecBase;
11819 if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
11820 if (VF == VecBaseTy->getNumElements() &&
11821 GatheredScalars.size() != VF) {
11823 GatheredScalars.append(VF - GatheredScalars.size(),
11829 if (!ExtractShuffles.
empty() || E->getOpcode() != Instruction::Load ||
11830 E->isAltShuffle() ||
11831 all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
11833 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11835 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
11837 if (!GatherShuffles.
empty()) {
11838 if (std::optional<ResTy> Delayed =
11839 ShuffleBuilder.needToDelay(E, Entries)) {
11841 PostponedGathers.
insert(E);
11846 if (GatherShuffles.
size() == 1 &&
11848 Entries.front().front()->isSame(E->Scalars)) {
11853 <<
"SLP: perfect diamond match for gather bundle "
11856 Mask.resize(E->Scalars.size());
11857 const TreeEntry *FrontTE = Entries.front().front();
11858 if (FrontTE->ReorderIndices.empty() &&
11859 ((FrontTE->ReuseShuffleIndices.empty() &&
11860 E->Scalars.size() == FrontTE->Scalars.size()) ||
11861 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11862 std::iota(
Mask.begin(),
Mask.end(), 0);
11865 if (isa<PoisonValue>(V)) {
11869 Mask[
I] = FrontTE->findLaneForValue(V);
11872 ShuffleBuilder.add(*FrontTE, Mask);
11873 Res = ShuffleBuilder.finalize(E->getCommonMask());
11877 if (GatheredScalars.size() != VF &&
11879 return any_of(TEs, [&](
const TreeEntry *TE) {
11880 return TE->getVectorFactor() == VF;
11883 GatheredScalars.append(VF - GatheredScalars.size(),
11887 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
11895 bool IsRootPoison) {
11898 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
11905 int NumNonConsts = 0;
11908 if (isa<UndefValue>(V)) {
11909 if (!isa<PoisonValue>(V)) {
11924 Scalars.
front() = OrigV;
11927 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
11928 Scalars[Res.first->second] = OrigV;
11929 ReuseMask[
I] = Res.first->second;
11932 if (NumNonConsts == 1) {
11937 if (!UndefPos.
empty() && UndefPos.
front() == 0)
11940 ReuseMask[SinglePos] = SinglePos;
11941 }
else if (!UndefPos.
empty() && IsSplat) {
11946 return !isa<UndefValue>(V) &&
11948 (E->UserTreeIndices.size() == 1 &&
11952 return E->UserTreeIndices.front().EdgeIdx !=
11953 U.getOperandNo() &&
11955 E->UserTreeIndices.front().UserTE->Scalars,
11959 if (It != Scalars.
end()) {
11961 int Pos = std::distance(Scalars.
begin(), It);
11962 for (
int I : UndefPos) {
11964 ReuseMask[
I] = Pos;
11973 for (
int I : UndefPos) {
11975 if (isa<UndefValue>(Scalars[
I]))
11982 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
11983 bool IsNonPoisoned =
true;
11984 bool IsUsedInExpr =
true;
11985 Value *Vec1 =
nullptr;
11986 if (!ExtractShuffles.
empty()) {
11990 Value *Vec2 =
nullptr;
11991 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
11995 if (UseVecBaseAsInput) {
11996 Vec1 = ExtractVecBase;
11998 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
12001 if (isa<UndefValue>(E->Scalars[
I]))
12003 auto *EI = cast<ExtractElementInst>(E->Scalars[
I]);
12004 Value *VecOp = EI->getVectorOperand();
12005 if (
const auto *TE = getTreeEntry(VecOp))
12006 if (
TE->VectorizedValue)
12007 VecOp =
TE->VectorizedValue;
12010 }
else if (Vec1 != VecOp) {
12011 assert((!Vec2 || Vec2 == VecOp) &&
12012 "Expected only 1 or 2 vectors shuffle.");
12018 IsUsedInExpr =
false;
12021 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12023 IsUsedInExpr &= FindReusedSplat(
12025 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
12026 ExtractMask.size());
12027 ShuffleBuilder.add(Vec1, ExtractMask,
true);
12030 IsUsedInExpr =
false;
12032 ScalarTy, GatheredScalars.size())),
12033 ExtractMask,
true);
12036 if (!GatherShuffles.
empty()) {
12037 unsigned SliceSize = E->Scalars.size() / NumParts;
12039 for (
const auto [
I, TEs] :
enumerate(Entries)) {
12042 "No shuffles with empty entries list expected.");
12046 "Expected shuffle of 1 or 2 entries.");
12049 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
12050 if (TEs.
size() == 1) {
12051 IsUsedInExpr &= FindReusedSplat(
12052 VecMask, TEs.
front()->getVectorFactor(),
I, SliceSize);
12053 ShuffleBuilder.add(*TEs.
front(), VecMask);
12054 if (TEs.
front()->VectorizedValue)
12058 IsUsedInExpr =
false;
12059 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
12060 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
12071 int EMSz = ExtractMask.size();
12072 int MSz =
Mask.size();
12075 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
12076 bool IsIdentityShuffle =
12077 ((UseVecBaseAsInput ||
12079 [](
const std::optional<TTI::ShuffleKind> &SK) {
12083 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
12085 (!GatherShuffles.
empty() &&
12087 [](
const std::optional<TTI::ShuffleKind> &SK) {
12091 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
12093 bool EnoughConstsForShuffle =
12097 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12101 return isa<Constant>(V) && !isa<UndefValue>(V);
12103 (!IsIdentityShuffle ||
12104 (GatheredScalars.size() == 2 &&
12106 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
12108 return isa<Constant>(V) && !isa<PoisonValue>(V);
12112 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
12113 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
12119 if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12121 TryPackScalars(GatheredScalars, BVMask,
true);
12122 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12123 ShuffleBuilder.add(BV, BVMask);
12126 return isa<PoisonValue>(V) ||
12127 (IsSingleShuffle && ((IsIdentityShuffle &&
12128 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12130 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12132 Res = ShuffleBuilder.finalize(
12133 E->ReuseShuffleIndices, E->Scalars.size(),
12135 TryPackScalars(NonConstants, Mask,
false);
12136 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
12141 TryPackScalars(GatheredScalars, ReuseMask,
true);
12142 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
12143 ShuffleBuilder.add(BV, ReuseMask);
12144 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12149 if (!isa<PoisonValue>(V))
12152 Value *BV = ShuffleBuilder.gather(E->Scalars);
12153 ShuffleBuilder.add(BV, Mask);
12154 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12158 Res = ShuffleBuilder.createFreeze(Res);
Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
                                                                *this);
}
12170 if (E->VectorizedValue &&
12171 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12172 E->isAltShuffle())) {
12173 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *E->Scalars[0] <<
".\n");
12174 return E->VectorizedValue;
12177 if (E->State == TreeEntry::NeedToGather) {
12179 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12180 setInsertPointAfterBundle(E);
12181 Value *Vec = createBuildVector(E);
12182 E->VectorizedValue = Vec;
12187 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E,
VectorType *VecTy) {
12188 ShuffleInstructionBuilder ShuffleBuilder(Builder, *
this);
12189 if (E->getOpcode() == Instruction::Store) {
12191 ArrayRef(
reinterpret_cast<const int *
>(E->ReorderIndices.begin()),
12192 E->ReorderIndices.size());
12193 ShuffleBuilder.add(V, Mask);
12194 }
else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12195 ShuffleBuilder.addOrdered(V, std::nullopt);
12197 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12199 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12202 assert((E->State == TreeEntry::Vectorize ||
12203 E->State == TreeEntry::ScatterVectorize ||
12204 E->State == TreeEntry::StridedVectorize) &&
12205 "Unhandled state");
12206 unsigned ShuffleOrOp =
12207 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
12210 if (
auto *Store = dyn_cast<StoreInst>(VL0))
12211 ScalarTy =
Store->getValueOperand()->getType();
12212 else if (
auto *IE = dyn_cast<InsertElementInst>(VL0))
12213 ScalarTy =
IE->getOperand(1)->getType();
12214 auto It = MinBWs.
find(E);
12215 if (It != MinBWs.
end())
12217 auto GetOperandSignedness = [&](
unsigned Idx) {
12218 const TreeEntry *OpE = getOperandEntry(E,
Idx);
12219 bool IsSigned =
false;
12220 auto It = MinBWs.
find(OpE);
12221 if (It != MinBWs.
end())
12222 IsSigned = It->second.second;
12225 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12230 switch (ShuffleOrOp) {
12231 case Instruction::PHI: {
12232 assert((E->ReorderIndices.empty() ||
12233 E != VectorizableTree.front().get() ||
12234 !E->UserTreeIndices.empty()) &&
12235 "PHI reordering is free.");
12236 if (PostponedPHIs && E->VectorizedValue)
12237 return E->VectorizedValue;
12238 auto *PH = cast<PHINode>(VL0);
12240 PH->getParent()->getFirstNonPHIIt());
12242 if (PostponedPHIs || !E->VectorizedValue) {
12249 PH->getParent()->getFirstInsertionPt());
12252 V = FinalShuffle(V, E, VecTy);
12254 E->VectorizedValue =
V;
12258 PHINode *NewPhi = cast<PHINode>(E->PHI);
12267 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12273 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12277 if (!VisitedBBs.
insert(IBB).second) {
12284 Value *Vec = vectorizeOperand(E,
I,
true);
12285 if (VecTy != Vec->
getType()) {
12287 getOperandEntry(E,
I)->State == TreeEntry::NeedToGather ||
12288 MinBWs.
contains(getOperandEntry(E,
I))) &&
12289 "Expected item in MinBWs.");
12290 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
12296 "Invalid number of incoming values");
12300 case Instruction::ExtractElement: {
12301 Value *
V = E->getSingleOperand(0);
12302 if (
const TreeEntry *TE = getTreeEntry(V))
12303 V =
TE->VectorizedValue;
12304 setInsertPointAfterBundle(E);
12305 V = FinalShuffle(V, E, VecTy);
12306 E->VectorizedValue =
V;
12309 case Instruction::ExtractValue: {
12310 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12315 NewV = FinalShuffle(NewV, E, VecTy);
12316 E->VectorizedValue = NewV;
12319 case Instruction::InsertElement: {
12320 assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
12322 Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
12324 Type *ScalarTy =
Op.front()->getType();
12325 if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
12327 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
12328 assert(Res.first > 0 &&
"Expected item in MinBWs.");
12333 cast<FixedVectorType>(
V->getType())->getNumElements()),
12338 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
12339 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12341 const unsigned NumElts =
12342 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12343 const unsigned NumScalars = E->Scalars.size();
12346 assert(
Offset < NumElts &&
"Failed to find vector index offset");
12350 if (!E->ReorderIndices.empty()) {
12355 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
12358 bool IsIdentity =
true;
12360 Mask.swap(PrevMask);
12361 for (
unsigned I = 0;
I < NumScalars; ++
I) {
12364 IsIdentity &= InsertIdx -
Offset ==
I;
12367 if (!IsIdentity || NumElts != NumScalars) {
12371 if (NumElts != NumScalars &&
Offset == 0) {
12380 InsertMask[*InsertIdx] = *InsertIdx;
12381 if (!
Ins->hasOneUse())
12383 Ins = dyn_cast_or_null<InsertElementInst>(
12384 Ins->getUniqueUndroppableUser());
12387 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12389 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12392 if (!IsFirstPoison.
all()) {
12394 for (
unsigned I = 0;
I < NumElts;
I++) {
12396 IsFirstUndef.
test(
I)) {
12397 if (IsVNonPoisonous) {
12398 InsertMask[
I] =
I < NumScalars ?
I : 0;
12403 if (
Idx >= NumScalars)
12404 Idx = NumScalars - 1;
12405 InsertMask[
I] = NumScalars +
Idx;
12419 if (
auto *
I = dyn_cast<Instruction>(V)) {
12420 GatherShuffleExtractSeq.
insert(
I);
12421 CSEBlocks.
insert(
I->getParent());
12426 for (
unsigned I = 0;
I < NumElts;
I++) {
12431 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12434 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
12435 NumElts != NumScalars) {
12436 if (IsFirstUndef.
all()) {
12439 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12440 if (!IsFirstPoison.
all()) {
12441 for (
unsigned I = 0;
I < NumElts;
I++) {
12443 InsertMask[
I] =
I + NumElts;
12450 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
12451 if (
auto *
I = dyn_cast<Instruction>(V)) {
12452 GatherShuffleExtractSeq.
insert(
I);
12453 CSEBlocks.
insert(
I->getParent());
12458 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12459 for (
unsigned I = 0;
I < NumElts;
I++) {
12463 InsertMask[
I] += NumElts;
12466 FirstInsert->getOperand(0), V, InsertMask,
12467 cast<Instruction>(E->Scalars.back())->getName());
12468 if (
auto *
I = dyn_cast<Instruction>(V)) {
12469 GatherShuffleExtractSeq.
insert(
I);
12470 CSEBlocks.
insert(
I->getParent());
12475 ++NumVectorInstructions;
12476 E->VectorizedValue =
V;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
12491 setInsertPointAfterBundle(E);
12493 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12494 if (E->VectorizedValue) {
12495 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12496 return E->VectorizedValue;
12499 auto *CI = cast<CastInst>(VL0);
12501 Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
12502 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
12504 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
12507 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
12508 if (SrcIt != MinBWs.
end())
12509 SrcBWSz = SrcIt->second.first;
12510 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
12511 if (BWSz == SrcBWSz) {
12512 VecOpcode = Instruction::BitCast;
12513 }
else if (BWSz < SrcBWSz) {
12514 VecOpcode = Instruction::Trunc;
12515 }
else if (It != MinBWs.
end()) {
12516 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12517 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12518 }
else if (SrcIt != MinBWs.
end()) {
12519 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12521 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12523 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
12524 !SrcIt->second.second) {
12525 VecOpcode = Instruction::UIToFP;
12527 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12529 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
12530 V = FinalShuffle(V, E, VecTy);
12532 E->VectorizedValue =
V;
12533 ++NumVectorInstructions;
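// Editor's note: hypothetical standalone sketch of the width-based cast
// selection above: equal bit widths become a bitcast, a narrower destination
// a trunc, a wider one a sext or zext depending on the recorded signedness
// of the narrowed operand. Plain C++, not the LLVM code.
#include <cstdio>

enum class CastOp { BitCast, Trunc, SExt, ZExt };

static CastOp pickCast(unsigned DstBits, unsigned SrcBits, bool SrcSigned) {
  if (DstBits == SrcBits)
    return CastOp::BitCast;
  if (DstBits < SrcBits)
    return CastOp::Trunc;
  return SrcSigned ? CastOp::SExt : CastOp::ZExt;
}

int main() {
  std::printf("%d %d %d\n", (int)pickCast(16, 16, true),
              (int)pickCast(8, 32, false), (int)pickCast(32, 8, true));
}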
12536 case Instruction::FCmp:
12537 case Instruction::ICmp: {
12538 setInsertPointAfterBundle(E);
12540 Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
12541 if (E->VectorizedValue) {
12542 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12543 return E->VectorizedValue;
12545 Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
12546 if (E->VectorizedValue) {
12547 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12548 return E->VectorizedValue;
12550 if (
L->getType() !=
R->getType()) {
12551 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12552 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12553 MinBWs.
contains(getOperandEntry(E, 0)) ||
12554 MinBWs.
contains(getOperandEntry(E, 1))) &&
12555 "Expected item in MinBWs.");
12556 if (cast<VectorType>(
L->getType())
12558 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
12560 ->getIntegerBitWidth()) {
12561 Type *CastTy =
R->getType();
12564 Type *CastTy =
L->getType();
12573 VecTy = cast<FixedVectorType>(
V->getType());
12574 V = FinalShuffle(V, E, VecTy);
12576 E->VectorizedValue =
V;
12577 ++NumVectorInstructions;
12580 case Instruction::Select: {
12581 setInsertPointAfterBundle(E);
12583 Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
12584 if (E->VectorizedValue) {
12585 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12586 return E->VectorizedValue;
12588 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12589 if (E->VectorizedValue) {
12590 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12591 return E->VectorizedValue;
12593 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12594 if (E->VectorizedValue) {
12595 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12596 return E->VectorizedValue;
12600 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12601 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12602 MinBWs.
contains(getOperandEntry(E, 1)) ||
12603 MinBWs.
contains(getOperandEntry(E, 2))) &&
12604 "Expected item in MinBWs.");
12605 if (True->
getType() != VecTy)
12606 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
12607 if (False->
getType() != VecTy)
12608 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
12612 V = FinalShuffle(V, E, VecTy);
12614 E->VectorizedValue =
V;
12615 ++NumVectorInstructions;
12618 case Instruction::FNeg: {
12619 setInsertPointAfterBundle(E);
12621 Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
12623 if (E->VectorizedValue) {
12624 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12625 return E->VectorizedValue;
12631 if (
auto *
I = dyn_cast<Instruction>(V))
12634 V = FinalShuffle(V, E, VecTy);
12636 E->VectorizedValue =
V;
12637 ++NumVectorInstructions;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
12659 setInsertPointAfterBundle(E);
12661 Value *
LHS = vectorizeOperand(E, 0, PostponedPHIs);
12662 if (E->VectorizedValue) {
12663 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12664 return E->VectorizedValue;
12666 Value *
RHS = vectorizeOperand(E, 1, PostponedPHIs);
12667 if (E->VectorizedValue) {
12668 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12669 return E->VectorizedValue;
12673 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12674 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12675 MinBWs.
contains(getOperandEntry(E, 0)) ||
12676 MinBWs.
contains(getOperandEntry(E, 1))) &&
12677 "Expected item in MinBWs.");
12688 if (
auto *
I = dyn_cast<Instruction>(V)) {
12691 if (!MinBWs.
contains(E) && ShuffleOrOp == Instruction::Sub &&
12693 return isCommutative(cast<Instruction>(V));
12695 I->setHasNoUnsignedWrap(
false);
12698 V = FinalShuffle(V, E, VecTy);
12700 E->VectorizedValue =
V;
12701 ++NumVectorInstructions;
12705 case Instruction::Load: {
12708 setInsertPointAfterBundle(E);
12710 LoadInst *LI = cast<LoadInst>(VL0);
12713 if (E->State == TreeEntry::Vectorize) {
12715 }
else if (E->State == TreeEntry::StridedVectorize) {
12716 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
12717 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
12718 PO = IsReverseOrder ? PtrN : Ptr0;
12724 int Stride = *Diff / (
static_cast<int>(E->Scalars.size()) - 1);
12726 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
12727 DL->getTypeAllocSize(ScalarTy));
12731 return cast<LoadInst>(V)->getPointerOperand();
12734 std::optional<Value *> Stride =
12743 (IsReverseOrder ? -1 : 1) *
12744 static_cast<int>(
DL->getTypeAllocSize(ScalarTy))));
12746 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12748 Intrinsic::experimental_vp_strided_load,
12749 {VecTy, PO->
getType(), StrideTy},
12751 Builder.
getInt32(E->Scalars.size())});
12757 assert(E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
12758 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
12759 if (E->VectorizedValue) {
12760 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12761 return E->VectorizedValue;
12764 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12769 V = FinalShuffle(V, E, VecTy);
12770 E->VectorizedValue =
V;
12771 ++NumVectorInstructions;
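// Editor's note: hypothetical standalone sketch of the stride computation in
// the strided-load path above: with N pointers whose first and last offsets
// differ by Diff elements, the per-element stride is Diff / (N - 1), scaled
// to bytes by the element size and negated for a reversed lane order.
#include <cassert>
#include <cstdio>

int main() {
  const int NumScalars = 4;
  const int Diff = 12;      // last pointer - first pointer, in elements
  const int ElemSize = 8;   // bytes per scalar (e.g. a double)
  const bool IsReverseOrder = false;
  assert(Diff % (NumScalars - 1) == 0 && "pointers must be evenly strided");
  int Stride = Diff / (NumScalars - 1);
  int ByteStride = (IsReverseOrder ? -1 : 1) * Stride * ElemSize;
  std::printf("stride = %d elements (%d bytes)\n", Stride, ByteStride);
}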
12774 case Instruction::Store: {
12775 auto *
SI = cast<StoreInst>(VL0);
12777 setInsertPointAfterBundle(E);
12779 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
12780 if (VecValue->
getType() != VecTy)
12782 Builder.
CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
12783 VecValue = FinalShuffle(VecValue, E, VecTy);
12791 E->VectorizedValue =
V;
12792 ++NumVectorInstructions;
12795 case Instruction::GetElementPtr: {
12796 auto *GEP0 = cast<GetElementPtrInst>(VL0);
12797 setInsertPointAfterBundle(E);
12799 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
12800 if (E->VectorizedValue) {
12801 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12802 return E->VectorizedValue;
12806 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
12807 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
12808 if (E->VectorizedValue) {
12809 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12810 return E->VectorizedValue;
12815 Value *
V = Builder.
CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
12816 if (
Instruction *
I = dyn_cast<GetElementPtrInst>(V)) {
12818 for (
Value *V : E->Scalars) {
12819 if (isa<GetElementPtrInst>(V))
12825 V = FinalShuffle(V, E, VecTy);
12827 E->VectorizedValue =
V;
12828 ++NumVectorInstructions;
12832 case Instruction::Call: {
12833 CallInst *CI = cast<CallInst>(VL0);
12834 setInsertPointAfterBundle(E);
12840 It != MinBWs.
end() ? It->second.first : 0);
12843 VecCallCosts.first <= VecCallCosts.second;
12845 Value *ScalarArg =
nullptr;
12851 auto *CEI = cast<CallInst>(VL0);
12852 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
12857 ScalarArg = CEI->getArgOperand(
I);
12860 if (
ID == Intrinsic::abs && It != MinBWs.
end() &&
12861 It->second.first <
DL->getTypeSizeInBits(CEI->getType()))
12869 Value *OpVec = vectorizeOperand(E,
I, PostponedPHIs);
12870 if (E->VectorizedValue) {
12871 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12872 return E->VectorizedValue;
12874 ScalarArg = CEI->getArgOperand(
I);
12875 if (cast<VectorType>(OpVec->
getType())->getElementType() !=
12877 It == MinBWs.
end()) {
12880 OpVec = Builder.
CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
12881 }
else if (It != MinBWs.
end()) {
12882 OpVec = Builder.
CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
12891 if (!UseIntrinsic) {
12907 V = FinalShuffle(V, E, VecTy);
12909 E->VectorizedValue =
V;
12910 ++NumVectorInstructions;
12913 case Instruction::ShuffleVector: {
12914 assert(E->isAltShuffle() &&
12919 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
12920 "Invalid Shuffle Vector Operand");
12924 setInsertPointAfterBundle(E);
12925 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12926 if (E->VectorizedValue) {
12927 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12928 return E->VectorizedValue;
12930 RHS = vectorizeOperand(E, 1, PostponedPHIs);
12932 setInsertPointAfterBundle(E);
12933 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12935 if (E->VectorizedValue) {
12936 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12937 return E->VectorizedValue;
12944 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12945 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12946 MinBWs.
contains(getOperandEntry(E, 0)) ||
12947 MinBWs.
contains(getOperandEntry(E, 1))) &&
12948 "Expected item in MinBWs.");
12949 Type *CastTy = VecTy;
12953 ->getIntegerBitWidth() < cast<VectorType>(
RHS->
getType())
12955 ->getIntegerBitWidth())
12972 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
12973 V0 = Builder.
CreateCmp(CI0->getPredicate(), LHS, RHS);
12974 auto *AltCI = cast<CmpInst>(E->getAltOp());
12976 V1 = Builder.
CreateCmp(AltPred, LHS, RHS);
12979 unsigned SrcBWSz =
DL->getTypeSizeInBits(
12980 cast<VectorType>(
LHS->
getType())->getElementType());
12981 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
12982 if (BWSz <= SrcBWSz) {
12983 if (BWSz < SrcBWSz)
12986 if (
auto *
I = dyn_cast<Instruction>(LHS))
12988 E->VectorizedValue =
LHS;
12989 ++NumVectorInstructions;
13000 for (
Value *V : {V0, V1}) {
13001 if (
auto *
I = dyn_cast<Instruction>(V)) {
13002 GatherShuffleExtractSeq.
insert(
I);
13003 CSEBlocks.
insert(
I->getParent());
13012 E->buildAltOpShuffleMask(
13014 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
13018 Mask, &OpScalars, &AltScalars);
13022 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
13024 if (
auto *
I = dyn_cast<Instruction>(Vec);
13025 I && Opcode == Instruction::Sub && !MinBWs.
contains(E) &&
13027 auto *IV = cast<Instruction>(V);
13028 return IV->getOpcode() == Instruction::Sub &&
13029 isCommutative(cast<Instruction>(IV));
13031 I->setHasNoUnsignedWrap(
false);
13033 DropNuwFlag(V0, E->getOpcode());
13034 DropNuwFlag(V1, E->getAltOpcode());
13037 if (
auto *
I = dyn_cast<Instruction>(V)) {
13039 GatherShuffleExtractSeq.
insert(
I);
13040 CSEBlocks.
insert(
I->getParent());
13043 E->VectorizedValue =
V;
13044 ++NumVectorInstructions;
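// Editor's note: hypothetical standalone sketch of the alternate-opcode
// blend above: both candidate vectors V0 (main opcode) and V1 (alternate
// opcode) are computed, then a mask picks lane I from V0 as I and from V1 as
// I + VF. Plain C++, not the LLVM code.
#include <cstdio>
#include <vector>

int main() {
  // true = this lane's scalar used the alternate opcode.
  std::vector<bool> UsesAlt = {false, true, false, true};
  const int VF = static_cast<int>(UsesAlt.size());
  std::vector<int> Mask(VF);
  for (int I = 0; I < VF; ++I)
    Mask[I] = UsesAlt[I] ? I + VF : I; // select from V1 or V0
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 5 2 7
  std::printf("\n");
}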
13057 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13063struct ShuffledInsertData {
13076 for (
auto &BSIter : BlocksSchedules) {
13077 scheduleBlock(BSIter.second.get());
13081 EntryToLastInstruction.
clear();
13091 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13092 if (TE->State == TreeEntry::Vectorize &&
13093 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13094 TE->VectorizedValue)
13100 for (
const TreeEntry *E : PostponedNodes) {
13101 auto *TE =
const_cast<TreeEntry *
>(E);
13102 if (
auto *VecTE = getTreeEntry(TE->Scalars.front()))
13103 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13104 TE->UserTreeIndices.front().EdgeIdx)))
13108 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13109 TE->VectorizedValue =
nullptr;
13111 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13120 if (isa<PHINode>(UserI)) {
13123 for (
User *U : PrevVec->users()) {
13126 auto *UI = dyn_cast<Instruction>(U);
13127 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->
getParent())
13129 if (UI->comesBefore(InsertPt))
13138 if (Vec->
getType() != PrevVec->getType()) {
13140 PrevVec->getType()->isIntOrIntVectorTy() &&
13141 "Expected integer vector types only.");
13143 "Expected user in MinBWs.");
13144 bool IsSigned = MinBWs.
lookup(TE->UserTreeIndices.front().UserTE).second;
13148 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
13151 auto It = PostponedValues.
find(PrevVec);
13152 if (It != PostponedValues.
end()) {
13153 for (TreeEntry *VTE : It->getSecond())
13154 VTE->VectorizedValue = Vec;
13174 for (
const auto &ExternalUse : ExternalUses) {
13175 Value *Scalar = ExternalUse.Scalar;
13182 TreeEntry *E = getTreeEntry(Scalar);
13183 assert(E &&
"Invalid scalar");
13184 assert(E->State != TreeEntry::NeedToGather &&
13185 "Extracting from a gather list");
13187 if (E->getOpcode() == Instruction::GetElementPtr &&
13188 !isa<GetElementPtrInst>(Scalar))
13191 Value *Vec = E->VectorizedValue;
13192 assert(Vec &&
"Can't find vectorizable value");
13195 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
13196 if (Scalar->getType() != Vec->
getType()) {
13197 Value *Ex =
nullptr;
13198 Value *ExV =
nullptr;
13199 auto *
GEP = dyn_cast<GetElementPtrInst>(Scalar);
13201 auto It = ScalarToEEs.find(Scalar);
13202 if (It != ScalarToEEs.end()) {
13206 if (EEIt != It->second.end()) {
13212 if (
auto *CI = EEIt->second.second)
13216 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13221 if (
auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13222 Value *V = ES->getVectorOperand();
13223 if (
const TreeEntry *ETE = getTreeEntry(V))
13224 V = ETE->VectorizedValue;
13226 }
else if (ReplaceGEP) {
13229 auto *CloneGEP =
GEP->clone();
13232 if (
GEP->hasName())
13233 CloneGEP->takeName(
GEP);
13241 if (Scalar->getType() != Ex->
getType())
13243 MinBWs.
find(E)->second.second);
13244 if (
auto *
I = dyn_cast<Instruction>(Ex))
13245 ScalarToEEs[Scalar].try_emplace(
13247 std::make_pair(
I, cast<Instruction>(ExV)));
13251 if (
auto *ExI = dyn_cast<Instruction>(Ex)) {
13252 GatherShuffleExtractSeq.
insert(ExI);
13253 CSEBlocks.
insert(ExI->getParent());
13257 assert(isa<FixedVectorType>(Scalar->getType()) &&
13258 isa<InsertElementInst>(Scalar) &&
13259 "In-tree scalar of vector type is not insertelement?");
13260 auto *IE = cast<InsertElementInst>(Scalar);
13268 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
13273 if (ExternalUsesAsGEPs.contains(U))
13275 TreeEntry *UseEntry = getTreeEntry(U);
13277 (UseEntry->State == TreeEntry::Vectorize ||
13279 TreeEntry::StridedVectorize) &&
13280 (E->State == TreeEntry::Vectorize ||
13281 E->State == TreeEntry::StridedVectorize) &&
13282 doesInTreeUserNeedToExtract(
13284 cast<Instruction>(UseEntry->Scalars.front()),
13287 "Scalar with nullptr User must be registered in "
13288 "ExternallyUsedValues map or remain as scalar in vectorized "
13290 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
13291 if (
auto *
PHI = dyn_cast<PHINode>(VecI))
13293 PHI->getParent()->getFirstNonPHIIt());
13296 std::next(VecI->getIterator()));
13300 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13302 Scalar->replaceAllUsesWith(NewInst);
13303 ReplacedExternals.emplace_back(Scalar, NewInst);
13307 if (
auto *VU = dyn_cast<InsertElementInst>(
User)) {
13309 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13310 if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
13311 if (!UsedInserts.
insert(VU).second)
13314 auto BWIt = MinBWs.
find(E);
13316 auto *ScalarTy = FTy->getElementType();
13317 auto Key = std::make_pair(Vec, ScalarTy);
13318 auto VecIt = VectorCasts.
find(Key);
13319 if (VecIt == VectorCasts.
end()) {
13321 if (
auto *IVec = dyn_cast<Instruction>(Vec))
13327 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
13328 BWIt->second.second);
13331 Vec = VecIt->second;
13338 find_if(ShuffledInserts, [VU](
const ShuffledInsertData &
Data) {
13345 unsigned Idx = *InsertIdx;
13346 if (It == ShuffledInserts.
end()) {
13348 It = std::next(ShuffledInserts.
begin(),
13349 ShuffledInserts.
size() - 1);
13355 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
13356 if (IEBase !=
User &&
13357 (!IEBase->hasOneUse() ||
13361 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
13363 IEBase = cast<InsertElementInst>(
Base);
13366 "InsertElementInstruction used already.");
13367 Mask[IEIdx] = IEIdx;
13368 Base = IEBase->getOperand(0);
13369 }
while (E == getTreeEntry(
Base));
13372 Base = cast<InsertElementInst>(
Base)->getOperand(0);
13376 auto It = VectorToInsertElement.
find(
Base);
13377 if (It != VectorToInsertElement.
end())
13384 Mask[
Idx] = ExternalUse.Lane;
13385 It->InsertElements.push_back(cast<InsertElementInst>(
User));
13394 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
13396 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13397 if (PH->getIncomingValue(
I) == Scalar) {
13399 PH->getIncomingBlock(
I)->getTerminator();
13400 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13402 std::next(VecI->getIterator()));
13406 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13407 PH->setOperand(
I, NewInst);
13412 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13417 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13427 int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
13428 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
13430 CombinedMask1[
I] = Mask[
I];
13432 CombinedMask2[
I] = Mask[
I] - VF;
13435 ShuffleBuilder.
add(V1, CombinedMask1);
13437 ShuffleBuilder.
add(V2, CombinedMask2);
13438 return ShuffleBuilder.
finalize(std::nullopt);
13442 bool ForSingleMask) {
13443 unsigned VF = Mask.size();
13444 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
13446 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
13447 Vec = CreateShuffle(Vec,
nullptr, Mask);
13448 return std::make_pair(Vec,
true);
13450 if (!ForSingleMask) {
13452 for (
unsigned I = 0;
I < VF; ++
I) {
13454 ResizeMask[Mask[
I]] = Mask[
I];
13456 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
13460 return std::make_pair(Vec,
false);
13464 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
13470 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
13471 Value *NewInst = performExtractsShuffleAction<Value>(
13475 return cast<VectorType>(Vec->getType())
13476 ->getElementCount()
13477 .getKnownMinValue();
13482 assert((Vals.size() == 1 || Vals.size() == 2) &&
13483 "Expected exactly 1 or 2 input values.");
13484 if (Vals.size() == 1) {
13487 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13488 ->getNumElements() ||
13489 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13490 return CreateShuffle(Vals.front(), nullptr, Mask);
13491 return Vals.front();
13493 return CreateShuffle(Vals.
front() ? Vals.
front()
13495 Vals.
back(), Mask);
13497 auto It = ShuffledInserts[
I].InsertElements.
rbegin();
13500 if (It != ShuffledInserts[
I].InsertElements.
rend())
13503 while (It != ShuffledInserts[
I].InsertElements.
rend()) {
13504 assert(II &&
"Must be an insertelement instruction.");
13508 Inserts.
push_back(cast<Instruction>(II));
13509 II = dyn_cast<InsertElementInst>(II->
getOperand(0));
13513 if (
auto *NewI = dyn_cast<Instruction>(NewInst))
13520 IE->replaceUsesOfWith(IE->getOperand(0),
13522 IE->replaceUsesOfWith(IE->getOperand(1),
13526 CSEBlocks.
insert(LastInsert->getParent());
13531 for (
auto &TEPtr : VectorizableTree) {
13532 TreeEntry *Entry = TEPtr.get();
13535 if (Entry->State == TreeEntry::NeedToGather)
13538 assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
13541 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13542 Value *Scalar = Entry->Scalars[Lane];
13544 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13545 !isa<GetElementPtrInst>(Scalar))
13548 Type *Ty = Scalar->getType();
13550 for (
User *U : Scalar->users()) {
13554 assert((getTreeEntry(U) ||
13555 (UserIgnoreList && UserIgnoreList->contains(U)) ||
13556 (isa_and_nonnull<Instruction>(U) &&
13557 isDeleted(cast<Instruction>(U)))) &&
13558 "Deleting out-of-tree value");
13562 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
13567 RemovedInsts.
push_back(cast<Instruction>(Scalar));
13573 if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
13574 V->mergeDIAssignID(RemovedInsts);
13577 InstrElementSize.
clear();
13579 const TreeEntry &RootTE = *VectorizableTree.front().get();
13580 Value *Vec = RootTE.VectorizedValue;
13581 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
13582 It != MinBWs.end() &&
13583 ReductionBitWidth != It->second.first) {
13586 ReductionRoot->getIterator());
13590 cast<VectorType>(Vec->
getType())->getElementCount()),
13591 It->second.second);
13598 <<
" gather sequences instructions.\n");
13605 Loop *L = LI->getLoopFor(
I->getParent());
13610 BasicBlock *PreHeader = L->getLoopPreheader();
13618 auto *OpI = dyn_cast<Instruction>(V);
13619 return OpI && L->contains(OpI);
13625 CSEBlocks.
insert(PreHeader);
13640 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
13641 "Different nodes should have different DFS numbers");
13642 return A->getDFSNumIn() <
B->getDFSNumIn();
13652 if (I1->getType() != I2->getType())
13654 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
13655 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
13657 return I1->isIdenticalTo(I2);
13658 if (SI1->isIdenticalTo(SI2))
13660 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
13661 if (SI1->getOperand(
I) != SI2->getOperand(
I))
13664 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
13668 unsigned LastUndefsCnt = 0;
13669 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
13675 NewMask[
I] != SM1[
I])
13678 NewMask[
I] = SM1[
I];
13682 return SM1.
size() - LastUndefsCnt > 1 &&
13686 SM1.
size() - LastUndefsCnt));
13692 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
13695 "Worklist not sorted properly!");
13701 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
13702 !GatherShuffleExtractSeq.contains(&In))
13707 bool Replaced =
false;
13710 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13711 DT->
dominates(V->getParent(), In.getParent())) {
13712 In.replaceAllUsesWith(V);
13714 if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
13715 if (!NewMask.
empty())
13716 SI->setShuffleMask(NewMask);
13720 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
13721 GatherShuffleExtractSeq.contains(V) &&
13722 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13723 DT->
dominates(In.getParent(), V->getParent())) {
13725 V->replaceAllUsesWith(&In);
13727 if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
13728 if (!NewMask.
empty())
13729 SI->setShuffleMask(NewMask);
13737 Visited.push_back(&In);
13742 GatherShuffleExtractSeq.clear();
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle)
      PrevInBundle->NextInBundle = BundleMember;
    else
      Bundle = BundleMember;
    // Group the instructions into a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // ...
  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](
                                   bool ReSchedule, ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes all dependencies invalid.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
      // ...
    }
    // ...
    LLVM_DEBUG(dbgs() << /* ... */ " in block " << BB->getName() << "\n");
    calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    // ...
    initialFillReadyList(ReadyInsts);
    // ...
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all members of the bundle.
  for (Value *V : VL) {
    // ...
    if (!extendSchedulingRegion(V, S)) {
      // Even though no bundle could be formed, dependencies may still have to
      // be recalculated for the extended region.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    // ...
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    // Make sure we don't leave the pieces of the bundle in the ready list.
    ReadyInsts.remove(BundleMember);
    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle; the whole block has to be
    // rescheduled.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.OpValue);
    return std::nullopt;
  }
  return Bundle;
}
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         /* ... */
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0)
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
  }
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
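// ScheduleData nodes are pool-allocated in fixed-size chunks: once the current
// chunk is exhausted a fresh array is appended and handed out element by
// element. Nodes are never freed individually, so the raw ScheduleData*
// pointers stored in maps and bundles stay stable for the lifetime of the
// BlockScheduling object. A minimal sketch of the same pattern (names here are
// illustrative, not from this file):
//
//   std::vector<std::unique_ptr<Node[]>> Chunks;
//   size_t Pos = ChunkSize;              // force allocation on first request
//   Node *allocate() {
//     if (Pos >= ChunkSize) {
//       Chunks.push_back(std::make_unique<Node[]>(ChunkSize));
//       Pos = 0;
//     }
//     return &Chunks.back()[Pos++];      // pointer stays valid forever
//   }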
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  // If V is already inside the region (or needs no scheduling), we are done.
  if (getScheduleData(V, isOneOf(S, V)))
    return true;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(/* ... */
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
    ScheduleData *ISD = getScheduleData(I);
    if (!ISD)
      return false;
    assert(isInSchedulingRegion(ISD) &&
           "ScheduleData not in scheduling region");
    ScheduleData *SD = allocateScheduleDataChunks();
    // ...
    SD->init(SchedulingRegionID, S.OpValue);
    ExtraScheduleDataMap[I][S.OpValue] = SD;
    return true;
  };
  if (CheckScheduleForI(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    // ...
    ScheduleEnd = I->getNextNode();
    // ...
    CheckScheduleForI(I);
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // ...
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  // ...
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    // ...
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    // ...
    CheckScheduleForI(I);
    // ...
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  // ...
  CheckScheduleForI(I);
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
      // ...
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());
  // ...
  while (!WorkList.empty()) {
    // ...
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      // ...
      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      if (BundleMember->OpValue != BundleMember->Inst) {
        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      } else {
        for (User *U : BundleMember->Inst->users()) {
          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
            BundleMember->Dependencies++;
            ScheduleData *DestBundle = UseSD->FirstInBundle;
            if (!DestBundle->IsScheduled)
              BundleMember->incrementUnscheduledDeps(1);
            if (!DestBundle->hasValidDependencies())
              WorkList.push_back(DestBundle);
          }
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // ...
      for (Instruction *I = BundleMember->Inst->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
        // ...
      }

      if (RegionHasStackSave) {
        // Allocas following a stacksave/stackrestore must not be reordered
        // above it; add the control dependencies.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              break;
            if (!isa<AllocaInst>(I))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // Likewise, allocas and memory accesses must not move below a later
        // stacksave/stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      // ...
      assert(/* ... */
             "NextLoadStore list for non memory effecting bundle?");
      // ...
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
        // ...
        if (/* ... */
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (/* ... */
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          // ...
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
        // ...
      }
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.insert(SD);
      // ...
    }
  }
}
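// calculateDependencies records three kinds of edges for every bundle member
// before counting how many of them are still unscheduled:
//   * def-use dependencies, found by walking the users of the instruction;
//   * control dependencies, added via MakeControlDependent for instructions
//     that must not be reordered across early exits, stacksave/stackrestore,
//     or allocas;
//   * memory dependencies, found by walking the NextLoadStore chain and
//     querying alias analysis (SLP->isAliased) for may-write conflicts.
// A bundle becomes "ready" only when its unscheduled-dependency count drops
// to zero.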
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    doForAllOpcodes(I, [&](ScheduleData *SD) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    });
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // ...
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert(SD->isPartOfBundle() ==
                 /* ... */
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    });
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      // ...
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

#ifdef EXPENSIVE_CHECKS
  // ...
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
      if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
        assert(SD->IsScheduled && "must be scheduled at this point");
      }
    });
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
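// ReadyInsts here is an ordered std::set, unlike the insertion-ordered ready
// list used while building bundles: SchedulingPriority is assigned in original
// program order, and ScheduleDataCompare sorts ready bundles by that priority,
// so of all bundles whose dependencies are satisfied the one that appeared
// earliest in the block is emitted first. This keeps the final schedule close
// to the source order and makes scheduling deterministic.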
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // Otherwise, traverse the expression tree bottom-up to find loads, extracts
  // or extractvalues feeding V and take the widest one.
  // ...
  if (auto *I = dyn_cast<Instruction>(V)) {
    // ...
  }

  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    // ...
    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip it.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a memory access or an extract, update the
    // maximum width.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    // ...
    for (Use &U : I->operands()) {
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (Visited.insert(J).second &&
            (isa<PHINode>(I) || J->getParent() == Parent)) {
          // ...
          continue;
        }
      if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
        FirstNonBool = U.get();
    }
    // ...
  }

  // If we didn't encounter a memory access in the expression tree, just return
  // the width of V (skipping i1 values if a wider seed exists).
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  // ...
  InstrElementSize[I] = Width;
  return Width;
}
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    /* ...ToDemote, Visited... */ unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
  // ...

  // Check if the value can be represented in BitWidth bits (possibly after
  // narrowing BitWidth itself).
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // ...
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    // ...
    if (auto *I = dyn_cast<Instruction>(V)) {
      // ...
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    // ...
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // ...
    if (Res && E.State == TreeEntry::NeedToGather &&
        all_of(E.Scalars, IsaPred<Constant>))
      // ...
    return Res;
  };
  if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (UserIgnoreList && UserIgnoreList->contains(U)) ||
                 (U->getType()->isSized() && !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return FinalAnalysis();

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, Level, IsProfitableToDemote,
                                 IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        // ...
        if (!FinalAnalysis())
          return false;
        // ...
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try wider and wider bit widths until the checker is satisfied.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          // ...
        }
        // ...
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        // ...
        (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                            std::ref(BitWidth)));
        // ...
        if (E.UserTreeIndices.size() > 1 &&
            !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                         std::ref(BitWidth))))
          return false;
        bool NeedToExit = false;
        if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
          return false;
        // ...
        if (!ProcessOperands(Operands, NeedToExit))
          return false;
        // ...
        return IsProfitableToDemote;
      };

  switch (E.getOpcode()) {
  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Shl: {
    // A shl is fine in a narrower type if the shift amount is in range.
    auto ShlChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // A lshr can be narrowed if the bits it would shift in are already zero.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // An ashr can be narrowed if all the bits it would shift in are sign bits.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    // ...
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    // ...
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    // ...
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        // ...
      });
    };
    // ...
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    // ...
    unsigned VF = E.Scalars.size();
    // Choose the best bit width based on cost estimation.
    // ...
    if (Cost < BestCost) {
      // ...
    }
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    // ...
  }
  // Otherwise, conservatively give up demoting this node.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
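// The demotion analysis asks, per tree entry, whether the computation can be
// carried out in a narrower integer type without changing the observable
// result (the relevant sign/zero bits are tracked via ComputeNumSignBits and
// MaskedValueIsZero). A typical case it is meant to catch, written as scalar
// IR purely for illustration (hypothetical values):
//
//   %a32 = zext i8 %a to i32
//   %b32 = zext i8 %b to i32
//   %s   = add i32 %a32, %b32
//   %t   = trunc i32 %s to i8
//
// Only the low 8 bits of %s are consumed, so the whole chain can be demoted
// and the vectorized add performed on <N x i8> instead of <N x i32>.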
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt &&
      VectorizableTree.front()->State != TreeEntry::NeedToGather)
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 &&
       any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
              /* user node index greater than */
              static_cast<int>(NodeIdx))))
    return;

  // The first value node of a store/insertelement tree may be just a
  // truncation of the real root; skip it.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  // ...
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    // ...
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // If the root was analyzed already and found not profitable, exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) {
    // ...
    unsigned VF = E.getVectorFactor();
    auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts = /* ... */;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // If all values are known non-negative they can be demoted to an unsigned
    // narrower type.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // Determine the maximum number of bits required to store the scalars.
    for (Value *Root : E.Scalars) {
      // ...
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // Reserve room for the sign bit if the value is not known positive.
      if (!IsKnownPositive)
        ++BitWidth1;
      // ...
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // ...
    if (NumParts > 1 &&
        /* ... */)
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, MaxDepthLevel, NeedToDemote,
                               IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
                                               /* ... */
                                               ) > 2)) /* ... */)))
      return 0u;
    // Round up to the next power of 2.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };

  // For reductions seeded by an ignore list, compute the bit width required by
  // the reduction itself.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    for (Value *V : *UserIgnoreList) {
      // ...
      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // ...
      unsigned BitWidth2 = BitWidth1;
      // ...
      BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      ReductionBitWidth =
          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
    }
    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
      ReductionBitWidth = 8;
    ReductionBitWidth = bit_ceil(ReductionBitWidth);
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    // ...
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    // ...
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
        Opcode, Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      // ...
      uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
      if (OrigBitWidth > MaxBitWidth) {
        // ...
      }
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC && IC->isSigned();
                          });
                 });
    }

    // If the maximum computed bit width is not smaller than the width of the
    // roots' type, nothing can be narrowed here.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
      if (UserIgnoreList)
        // ...
      continue;
    }

    // Finally, record the entries we can demote together with their
    // signedness.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      // ...
      bool IsSigned = TE->getOpcode() == Instruction::SExt ||
                      any_of(TE->Scalars, [&](Value *R) {
                        return !isKnownNonNegative(R, SimplifyQuery(*DL));
                      });
      // ...
    }
  }
}
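// The outcome of this analysis, for each demotable tree entry, is the
// narrowest power-of-two bit width (bit_ceil of the computed maximum, with
// anything between 2 and 7 bits rounded up to 8) that still preserves the
// observable result, together with a flag saying whether the narrowed value
// must be sign- or zero-extended back to the original type at the tree
// boundaries.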
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  // ...

bool SLPVectorizerPass::runImpl(Function &F, /* ...analysis pointers... */) {
  // ...
  DL = &F.getParent()->getDataLayout();
  // ...

  bool Changed = false;

  // If the target claims to have no vector registers, don't attempt
  // vectorization.
  if (/* ... */) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom-up SLP vectorizer to construct chains that start with store
  // instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << /* ... */ << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This is
    // primarily intended to catch gather-like idioms ending at non-consecutive
    // loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << /* ... */ << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    // ...
  }
  return Changed;
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                            unsigned Idx, unsigned MinVF) {
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();
  // ...

  R.buildTree(Chain);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  if (R.isLoadCombineCandidate())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();

  R.computeMinimumValueSizes();
  // ...
  if (/* ...cost is beneficial... */) {
    using namespace ore;
    R.getORE()->emit(OptimizationRemark(SV_NAME, /* ... */,
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));
    // ...
  }
  // ...
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                        BoUpSLP &R) {
  // ...
  bool Changed = false;

  // Stores within one set are sorted by the distance (in elements) of their
  // address from a common base store; consecutive distances form a chain.
  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of the store in the Stores array, distance of the
  // store address relative to the base store address, in elements).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    // Collect a chain of stores while the distances stay consecutive.
    // ...
    if (Operands.empty() || Data.second - PrevDist == 1) {
      // ...
      PrevDist = Data.second;
      if (Idx != Set.size() - 1)
        continue;
    }
    // ...
    Operands.push_back(Stores[DataVar.first]);
    PrevDist = DataVar.second;
    // ...

    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(Operands[0]);
    // ...
    unsigned MaxVF =
        std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
    auto *Store = cast<StoreInst>(Operands[0]);
    Type *StoreTy = Store->getValueOperand()->getType();
    Type *ValueTy = StoreTy;
    if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
      ValueTy = Trunc->getSrcTy();
    unsigned MinVF = /* ... */(
        R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));

    if (MaxVF < MinVF) {
      LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                        << ") < "
                        << "MinVF (" << MinVF << ")\n");
      return;
    }

    unsigned NonPowerOf2VF = 0;
    // First try vectorizing with a non-power-of-2 VF if enabled and profitable.
    // ...
    unsigned CandVF = Operands.size();
    if (/* ... */)
      NonPowerOf2VF = CandVF;
    // ...

    // Enumerate candidate vectorization factors, doubling from MinVF; an
    // over-sized request falls back to the non-power-of-2 VF.
    unsigned Size = MinVF;
    // ...
    VF = Size > MaxVF ? NonPowerOf2VF : Size;
    // ...

    unsigned StartIdx = 0;
    for (unsigned Size : CandidateVFs) {
      for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
        // ...
        assert(all_of(Slice,
                      [&](Value *V) {
                        return cast<StoreInst>(V)
                                   ->getValueOperand()
                                   ->getType() ==
                               cast<StoreInst>(Slice.front())
                                   ->getValueOperand()
                                   ->getType();
                      }) &&
               "Expected all operands of same type.");
        if (!VectorizedStores.count(Slice.front()) &&
            !VectorizedStores.count(Slice.back()) &&
            /* ... */
            vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
          // ...
        }
        // ...
        if (Cnt == StartIdx)
          // ...
      }
    }
  };

  // Insert store SI with index Idx into the matching distance set; when a
  // store at the same distance is seen again, vectorize the already collected
  // set first.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      StoreIndexToDistSet PrevSet;
      PrevSet.swap(Set.second);
      // ...
      Set.second.emplace(Idx, 0);
      // Re-insert the stores that followed the previous match so they can be
      // tried together with this store.
      unsigned StartIdx = It->first + 1;
      // ...
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not re-try sequences that were already attempted or vectorized.
        if (Pair.first <= It->first ||
            VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - It->second;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    // ...
    Res.second.emplace(Idx, 0);
  };

  // ...
  // Flush and restart the sets whenever the stored value type changes.
  if (/* ... */ SI->getValueOperand()->getType()) {
    for (auto &Set : SortedStores)
      TryToVectorize(Set.second);
    SortedStores.clear();
    // ...
  }
  FillStoresSet(I, SI);
  // ...

  // Final vectorization attempt on whatever sets remain.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
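// Candidate stores are grouped into sets keyed by their pointer distance (in
// elements) from a common base store; each set is kept sorted by distance so
// that a run of consecutive distances (d, d+1, d+2, ...) can be sliced out and
// handed to vectorizeStoreChain. Roughly, for a body like (illustrative C,
// not from the original):
//
//   a[i] = x0; a[i+1] = x1; a[i+2] = x2; a[i+3] = x3;
//
// the four stores land in one set with distances 0..3 and are tried as a
// single chain, while stores to unrelated bases start new sets.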
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections; a single pass over BB sorts seed instructions
  // into Stores and GEPs by the underlying object of their pointer operand.
  // ...
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      // ...
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      // ...
      if (isa<Constant>(Idx))
        continue;
      // ...
      if (GEP->getType()->isVectorTy())
        continue;
      // ...
    }
  }
}
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");
  // ...
  if (!S.getOpcode())
    return false;
  // ...
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (/* ...unsupported element type... */) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        // ...
        return /* ...remark... */
               << "Cannot SLP vectorize list: type "
               << rso.str() + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return /* ...remark... */
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  // ...
  Type *ScalarTy = VL[0]->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
    ScalarTy = IE->getOperand(1)->getType();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // ...
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;
      // Skip slices containing already deleted instructions.
      // ...
      auto *I = dyn_cast<Instruction>(V);
      return I && R.isDeleted(I);
      // ...

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        /* ... */);
      // ...
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      // ...
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << /* ... */ << " for VF=" << ActualVF << "\n");
      if (/* ...cost is beneficial... */) {
        R.getORE()->emit(OptimizationRemark(SV_NAME, /* ... */,
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));
        // ...
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return /* ...missed remark... */
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             /* ...threshold... */;
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return /* ...missed remark... */
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;
  // ...
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
    return false;
  // ...

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Collect candidate pairs built from the operands of the two sides.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P)
      // ...
    if (B1 && B1->getParent() == P)
      // ...
  }
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P)
      // ...
    if (A1 && A1->getParent() == P)
      // ...
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
  /// The operation data of the reduction operation.
  ReductionOpsListType ReductionOps;
  // ...
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;
    // ...
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;
    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe bool logic is expressed via select; skip its "true" operand.
    if (/* ... */)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    // ...
    switch (Kind) {
    case RecurKind::Or:
      // ...
    case RecurKind::And:
      // ...
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      // ...
    case RecurKind::FMax:
      // ...
    case RecurKind::FMin:
      // ...
    case RecurKind::FMaximum:
      // ...
    case RecurKind::FMinimum:
      // ...
    case RecurKind::SMax:
      // ...
    case RecurKind::SMin:
      // ...
    case RecurKind::UMax:
      // ...
    case RecurKind::UMin:
      // ...
    default:
      // ...
    }
  }

  /// Creates reduction operation with the current opcode, taking the IR flags
  /// from \p ReductionOps.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (/* ... */) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        // ...
      }
      // ...
    }
    return Op;
  }

  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) /* ... */)
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) /* ... */)
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (/* ...maxnum... */)
      return RecurKind::FMax;
    if (/* ...minnum... */)
      return RecurKind::FMin;
    if (/* ...maximum... */)
      return RecurKind::FMaximum;
    if (/* ...minimum... */)
      return RecurKind::FMinimum;
    // ...
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Look through the select for a min/max pattern obscured by other
      // transforms; reject operands that are extractelements of already
      // vectorized values.
      // ...
      if (!isa<ExtractElementInst>(RHS) ||
          /* ... */)
        return RecurKind::None;
      if (!isa<ExtractElementInst>(LHS) ||
          /* ... */)
        return RecurKind::None;
      if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
        return RecurKind::None;
      // ...
        return RecurKind::None;
      // ...
        return RecurKind::None;
      // ...
        return RecurKind::SMax;
      // ...
        return RecurKind::SMin;
      // ...
        return RecurKind::UMax;
      // ...
        return RecurKind::UMin;
    }
    return RecurKind::None;
  }
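  // Integer min/max reductions typically reach this matcher in cmp+select form
  // rather than as intrinsics, e.g. (illustrative IR):
  //
  //   %c = icmp slt i32 %x, %y
  //   %m = select i1 %c, i32 %x, i32 %y   ; smin(%x, %y)
  //
  // which is why the select-based patterns above map onto RecurKind::SMin,
  // SMax, UMin and UMax, and why min/max reduction ops are tracked as a
  // (compare, select) pair elsewhere in this class.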
  /// Index of the first operand of \p I that participates in the reduction
  /// (min/max reductions are cmp + select, so the compare is skipped).
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands of the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB; for a cmp+select
  /// min/max reduction both the select and its compare must be in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while the condition must have a single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // Arithmetic reduction operations must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && /* ... */);
  }
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ...
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;
    // ...
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    // ...
    // Walk the operands of the reduction op and classify each edge as another
    // reduction op, a reduced value, or an extra argument.
    for (int I = getFirstOperandIndex(TreeN),
             End = getNumberOfOperands(TreeN);
         I < End; ++I) {
      Value *EdgeVal = getRdxOperand(TreeN, I);
      ReducedValsToOps[EdgeVal].push_back(TreeN);
      auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
      if (/* ... */
          !hasSameParent(EdgeInst, BB)) {
        ExtraArgs.push_back(EdgeVal);
        continue;
      }
      if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
          IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
          !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
          !isVectorizable(RdxKind, EdgeInst) ||
          (R.isAnalyzedReductionRoot(EdgeInst) &&
           all_of(EdgeInst->operands(), IsaPred<Constant>))) {
        PossibleReducedVals.push_back(EdgeVal);
        continue;
      }
      ReductionOps.push_back(EdgeInst);
    }
    // ...
    // Map of reduced values, keyed by opcode/pointer subkeys, so equal or
    // related values end up in the same group.
    /* ... */ PossibleReducedVals;
    initReductionOps(Root);
    // ...
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      // ...
      auto LIt = LoadsMap.find(Ptr);
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        for (LoadInst *RLI : LIt->second) {
          // ...
          DoNotReverseVals.insert(RLI);
        }
        if (LIt->second.size() > 2) {
          // ...
          hash_value(LIt->second.back()->getPointerOperand());
          DoNotReverseVals.insert(LIt->second.back());
        }
      }
      // ...
      LoadKeyUsed.insert(Key);
      // ...
    };

    while (!Worklist.empty()) {
      // ...
      CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
      // If too few operands are reduction ops, treat this as a leaf.
      if (Args.size() < 2) {
        addReductionOps(TreeN);
        // Add extra arguments.
        if (!Args.empty()) {
          assert(Args.size() == 1 && "Expected only single argument.");
          ExtraArgs[TreeN] = Args.front();
        }
        // ...
        for (Value *V : PossibleRedVals) {
          // ...
          ++PossibleReducedVals[Key][Idx]
                .insert(std::make_pair(V, 0))
                .first->second;
        }
        Worklist.append(PossibleReductionOps.rbegin(),
                        PossibleReductionOps.rend());
      } else {
        // ...
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(TreeN, 0))
              .first->second;
      }
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of values kept in the reduction.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      // ...
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; /* ... */) {
        auto RedValsVect = It->second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      // ...
      if (isGoodForReduction(Data) ||
          (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
           isa<LoadInst>(ReducedVals[NewIdx].front()) &&
           /* ... */
           cast<LoadInst>(Data.front())->getPointerOperand()) ==
               /* ... */)) {
        // ...
        NewIdx = ReducedVals.size();
        // ...
        if (DoNotReverseVals.contains(Data.front()))
          ReducedVals[NewIdx].append(Data.begin(), Data.end());
        else
          ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      } else {
        ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
      }
    }
    constexpr int ReductionLimit = 4;
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce to a nearby
    // power-of-2; oversized vectors are split by the backend later.
    unsigned NumReducedVals =
        std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
                        [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
                          if (!isGoodForReduction(Vals))
                            return Num;
                          return Num + Vals.size();
                        });
    if (NumReducedVals < ReductionLimit &&
        /* ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    // Track the reduced values in case they are replaced by extractelements
    // during vectorization.
    DenseMap<Value *, Value *> TrackedVals(
        ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
    // ...
    ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
    // The same extra argument may be used several times, so log each use.
    for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
      assert(Pair.first && "DebugLoc must be set.");
      ExternallyUsedValues[Pair.second].push_back(Pair.first);
      TrackedVals.try_emplace(Pair.second, Pair.second);
    }

    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    // Return the new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
            /* ... */) {
          auto It = ReducedValsToOps.find(Res);
          if (It != ReducedValsToOps.end() &&
              /* ... */)
            std::swap(VectorizedTree, Res);
        }
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    // The reduction root is used as the insertion point for new instructions,
    // so keep it alive as an externally used value.
    ExternallyUsedValues[ReductionRoot];
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        // ...
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math flags from all reduction operations.
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Need to track reduced vals: they may be changed during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);
    Value *VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      // ...
      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
        Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
        // ...
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((/* ... */ (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
            (S.getOpcode() && !Inst))
          continue;
        // ...
        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
          I + 1 < E) {
        InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
        if (NextS.getOpcode() == Instruction::ExtractElement &&
            !NextS.isAltShuffle()) {
          // ...
          for (Value *RV : ReducedVals[I + 1]) {
            Value *RdxVal = TrackedVals.find(RV)->second;
            // ...
            if (auto *Inst = dyn_cast<Instruction>(RdxVal))
              if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
                continue;
            CommonCandidates.push_back(RdxVal);
            TrackedToOrig.try_emplace(RdxVal, RV);
          }
          // ...
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (/* ... */) {
        // ...
        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
        for (Value *VC : /* ... */) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          /* ... */)
        continue;

      // Gather same values.
      IsSupportedHorRdxIdentityOp =
          /* ... */
          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
      // ...
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates)
          ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;

      // Check if all reduced values are the same or repeated the same number
      // of times; in that case the reduction can be replaced by a scaled
      // single operation.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(/* ... */,
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [](const auto &P) { return P.first; });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
          unsigned Cnt = SameValuesCounter.lookup(OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          continue;
        }
      }
      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      // ...
      unsigned ReduxWidth = std::min<unsigned>(
          /* ... */,
          std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
                               RegMaxNumber * RedValsMaxNumber));
      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with a lower vector factor if the
      // current attempt cannot be scheduled or was already analyzed.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
                                  &CheckForReusedReductionOpsLocal,
                                  &PrevReduxWidth, &V,
                                  &IgnoreList](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered; if so, worth
          // trying again with less number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        // ...
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        // ...
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops: drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            /* ... */) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        // ...
        if (V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        // ...
        auto *RedValI = dyn_cast<Instruction>(RedVal);
        // ...
        return V.isDeleted(RedValI);
        // ...
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        /* ... */ LocalExternallyUsedValues(ExternallyUsedValues);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues[TrackedVals[V]];
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.find(V)->second;
            ++SameValuesCounter[OrigV];
          }
        }
        // Gather externally used values.
        // ...
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues[RdxVal];
            continue;
          }
          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
          unsigned NumOps =
              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
          if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
            LocalExternallyUsedValues[RdxVal];
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues[RdxVal];
        // Update LocalExternallyUsedValues for the scalar, replaced by
        // extractelement instructions.
        // ...
        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
          ReplacementToExternal.try_emplace(Pair.second, Pair.first);
        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
          // ...
          auto RIt = ReplacementToExternal.find(Ext);
          while (RIt != ReplacementToExternal.end()) {
            // ...
            RIt = ReplacementToExternal.find(Ext);
          }
          auto *It = ExternallyUsedValues.find(Ext);
          if (It == ExternallyUsedValues.end())
            continue;
          LocalExternallyUsedValues[Pair.second].append(It->second);
        }
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost Cost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
        LLVM_DEBUG(dbgs() << /* ... */ << " for reduction\n");
        // ...
        if (/* ...not beneficial... */) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(
                       SV_NAME, "HorSLPNotBeneficial",
                       ReducedValsToOps.find(VL[0])->second.front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   /* ... */;
          });
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(
                     SV_NAME, "VectorizedHorizontalReduction",
                     ReducedValsToOps.find(VL[0])->second.front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });

        // Vectorize a tree.
        // ...
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        // ...
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
                                                ReplacedExternals, InsertPt);
        // ...
        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must be
        // frozen.
        if ((isBoolLogicOp(RdxRootInst) ||
             (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
            /* ... */)
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder,
                                         V.getRootNodeScalars(),
                                         SameValuesCounter, TrackedToOrig);
        }

        Value *ReducedSubTree =
            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
        if (ReducedSubTree->getType() != VL.front()->getType()) {
          // ...
          ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
            // ...
            R, cast<Instruction>(ReductionOps.front().front())
            /* ... */
            ->getDataLayout());
          // ...
        }

        // Improved analysis for add/fadd/xor reductions with the same scale
        // factor for all operands: emit a single scaled reduction instead.
        if (OptReusedScalars && SameScaleFactor)
          ReducedSubTree = emitScaleForReusedOps(
              ReducedSubTree, Builder, SameValuesCounter.front().second);

        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
        // Count vectorized reduced values to exclude them from final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        // ...
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          Value *OrigV = TrackedToOrig.find(P.first)->second;
          VectorizedVals.try_emplace(OrigV, P.second);
        }
      }
    }
    if (VectorizedTree) {
      // Reorder operands of bool logical ops into the natural order to avoid
      // poison propagation issues; if reordering is impossible, freeze the LHS
      // operand instead.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) &&
            ((!InitStep && LHS == VectorizedTree) ||
             getRdxOperand(RedOp1, 0) == LHS || /* ... */))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      /* ... */)) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction by combining the remaining (extra) reduced values
      // pairwise.
      auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                          bool InitStep) {
        unsigned Sz = InstVals.size();
        SmallVector<std::pair<Instruction *, Value *>> ExtraReds((Sz + 1) / 2);
        for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
          // ...
          Value *RdxVal1 = InstVals[I].second;
          Value *StableRdxVal1 = RdxVal1;
          auto It1 = TrackedVals.find(RdxVal1);
          if (It1 != TrackedVals.end())
            StableRdxVal1 = It1->second;
          Value *RdxVal2 = InstVals[I + 1].second;
          Value *StableRdxVal2 = RdxVal2;
          auto It2 = TrackedVals.find(RdxVal2);
          if (It2 != TrackedVals.end())
            StableRdxVal2 = It2->second;
          // To prevent poison from leaking across what used to be sequential,
          // safe, scalar boolean logic operations, the reduction operand must
          // be frozen.
          FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                            /* ... */ InitStep);
          Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                     StableRdxVal2, "op.rdx", ReductionOps);
          ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
        }
        if (Sz % 2)
          ExtraReds[Sz / 2] = InstVals.back();
        return ExtraReds;
      };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      // ...
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        // ...
      }
      // ...
      for (auto &Pair : ExternallyUsedValues) {
        // Add each externally used value to the final reduction.
        for (auto *I : Pair.second)
          ExtraReductions.emplace_back(I, Pair.first);
      }
      // Iterate through all not-vectorized reduction values/extra arguments.
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        VectorizedTree = ExtraReductions.front().second;
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining uses
      // outside the reduction operations; erase it bottom-up.
      // ...
      IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
      // ...
#ifndef NDEBUG
      for (auto *U : Ignore->users()) {
        assert(IgnoreSet.count(U) &&
               "All users must be either in the reduction ops list.");
      }
#endif
      if (!Ignore->use_empty()) {
        // ...
        Ignore->replaceAllUsesWith(Undef);
      }
      V.eraseInstruction(cast<Instruction>(Ignore));
      // ...
    } else if (!CheckForReusedReductionOps) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
                                   FastMathFlags FMF) {
    // ...
    Type *ScalarTy = ReducedVals.front()->getType();
    // ...
    // Scalar cost is repeated for N-1 elements.
    int Cnt = ReducedVals.size();
    for (Value *RdxVal : ReducedVals) {
      // ...
      Cost += GenCostFn();
      // ...
      auto *RdxOp = cast<Instruction>(U);
      if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
        // ...
      }
      // ...
      Cost += ScalarCost;
      // ...
      Cost += GenCostFn();
      // ...
    }
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // ...
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      // ...
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    default:
      // ...
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << /* ... */
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    ++NumVectorInstructions;
    // ...
  }

  /// Emits optimized code for the unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << /* ... */ << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      // ...
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << /* ... */ << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::None:
      // ...
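    // emitScaleForReusedOps folds Cnt identical occurrences of a reused value
    // into one operation on the already-vectorized value, e.g. (illustrative):
    //   x + x + x        ->  3 * x       (RecurKind::Add)
    //   x fadd x         ->  2.0 * x     (RecurKind::FAdd)
    //   x ^ x ^ x        ->  x           (odd count, RecurKind::Xor; an even
    //                                     count would fold to 0)
    //   max(x, x, ...)   ->  x           (idempotent kinds pass the value
    //                                     through unchanged)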
16922 assert(IsSupportedHorRdxIdentityOp &&
16923 "The optimization of matched scalar identity horizontal reductions "
16924 "must be supported.");
16925 auto *VTy = cast<FixedVectorType>(VectorizedValue->
getType());
16926 if (VTy->getElementType() != VL.
front()->getType()) {
16932 R, cast<Instruction>(ReductionOps.front().front())
16934 ->getDataLayout());
16939 case RecurKind::Add: {
16942 for (
Value *V : VL) {
16943 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
16944 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
false));
16948 << VectorizedValue <<
". (HorRdx)\n");
16949 return Builder.
CreateMul(VectorizedValue, Scale);
16951 case RecurKind::And:
16952 case RecurKind::Or:
16955 <<
". (HorRdx)\n");
16956 return VectorizedValue;
16957 case RecurKind::SMax:
16958 case RecurKind::SMin:
16959 case RecurKind::UMax:
16960 case RecurKind::UMin:
16961 case RecurKind::FMax:
16962 case RecurKind::FMin:
16963 case RecurKind::FMaximum:
16964 case RecurKind::FMinimum:
16967 <<
". (HorRdx)\n");
16968 return VectorizedValue;
16969 case RecurKind::Xor: {
16975 cast<FixedVectorType>(VectorizedValue->
getType())->getNumElements(),
16977 std::iota(
Mask.begin(),
Mask.end(), 0);
16978 bool NeedShuffle =
false;
16979 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
16981 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
16982 if (Cnt % 2 == 0) {
16984 NeedShuffle =
true;
16990 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
16994 ConstantVector::getNullValue(VectorizedValue->
getType()),
Mask);
16995 return VectorizedValue;
16997 case RecurKind::FAdd: {
17000 for (Value *V : VL) {
17001 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17002 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17005 return Builder.CreateFMul(VectorizedValue, Scale);
17007 case RecurKind::Mul:
17008 case RecurKind::FMul:
17009 case RecurKind::FMulAdd:
17010 case RecurKind::IAnyOf:
17011 case RecurKind::FAnyOf:
17012 case RecurKind::None:
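// Editor's note (illustration only, not part of SLPVectorizer.cpp): for
// reused scalars the code above builds a per-lane repeat count. For Add it
// multiplies each lane by its count; for Xor a lane repeated an even number
// of times cancels to zero, which the real code models with a shuffle that
// pulls zero into those lanes. A hypothetical scalar sketch of both rules:
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> scaleForAdd(std::vector<int64_t> Lanes,
                                        const std::vector<unsigned> &Cnt) {
  for (size_t I = 0; I < Lanes.size(); ++I)
    Lanes[I] *= Cnt[I]; // a lane repeated Cnt[I] times contributes Lane*Cnt[I]
  return Lanes;
}

static std::vector<int64_t> scaleForXor(std::vector<int64_t> Lanes,
                                        const std::vector<unsigned> &Cnt) {
  for (size_t I = 0; I < Lanes.size(); ++I)
    if (Cnt[I] % 2 == 0)
      Lanes[I] = 0; // an even number of identical xor operands cancels out
  return Lanes;
}

int main() {
  assert(scaleForAdd({3, 5}, {2, 1}) == (std::vector<int64_t>{6, 5}));
  assert(scaleForXor({3, 5}, {2, 3}) == (std::vector<int64_t>{0, 5}));
  return 0;
}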
17022 return HorizontalReduction::getRdxKind(V);
17025 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17026 return cast<FixedVectorType>(IE->getType())->getNumElements();
17028 unsigned AggregateSize = 1;
17029 auto *IV = cast<InsertValueInst>(InsertInst);
17030 Type *CurrentType = IV->getType();
17032 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
17033 for (auto *Elt : ST->elements())
17034 if (Elt != ST->getElementType(0))
17035 return std::nullopt;
17036 AggregateSize *= ST->getNumElements();
17037 CurrentType = ST->getElementType(0);
17038 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17039 AggregateSize *= AT->getNumElements();
17040 CurrentType = AT->getElementType();
17041 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17042 AggregateSize *= VT->getNumElements();
17043 return AggregateSize;
17045 return AggregateSize;
17047 return std::nullopt;
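// Editor's note (illustration only, not part of SLPVectorizer.cpp): the
// getAggregateSize() logic above multiplies the element counts of nested
// homogeneous aggregates until it reaches a fixed vector or scalar, e.g.
// [2 x [3 x <4 x float>]] flattens to 2 * 3 * 4 = 24 positions, and it gives
// up (std::nullopt) on structs with mixed member types. A toy model with a
// hypothetical Node type standing in for llvm::Type:
#include <cassert>
#include <optional>
#include <vector>

struct Node {
  unsigned Count = 1;         // array/struct/vector element count
  std::vector<Node> Children; // empty for scalars and fixed vectors
  bool Homogeneous = true;    // false models a struct with mixed member types
};

static std::optional<unsigned> aggregateSize(const Node &N) {
  if (!N.Homogeneous)
    return std::nullopt;      // mixed struct members: cannot map to a vector
  if (N.Children.empty())
    return N.Count;           // leaf: scalar (1) or fixed vector (VF)
  auto Inner = aggregateSize(N.Children.front());
  return Inner ? std::optional<unsigned>(N.Count * *Inner) : std::nullopt;
}

int main() {
  Node Vec{4, {}, true};     // <4 x float>
  Node Arr3{3, {Vec}, true}; // [3 x <4 x float>]
  Node Arr2{2, {Arr3}, true}; // [2 x [3 x <4 x float>]]
  assert(aggregateSize(Arr2) == 24u);
  return 0;
}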
17056 unsigned OperandOffset) {
17059 std::optional<unsigned> OperandIndex =
17063 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17065 BuildVectorOpds, InsertElts, *OperandIndex);
17068 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17069 InsertElts[*OperandIndex] = LastInsertInst;
17071 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
17072 } while (LastInsertInst != nullptr &&
17073 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17096 assert((isa<InsertElementInst>(LastInsertInst) ||
17097 isa<InsertValueInst>(LastInsertInst)) &&
17098 "Expected insertelement or insertvalue instruction!");
17101 "Expected empty result vectors!");
17104 if (!AggregateSize)
17106 BuildVectorOpds.resize(*AggregateSize);
17107 InsertElts.resize(*AggregateSize);
17112 if (BuildVectorOpds.size() >= 2)
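// Editor's note (illustration only, not part of SLPVectorizer.cpp): the
// findBuildAggregate* helpers above walk a chain of insertelement /
// insertvalue instructions from the last insert back through operand 0 and
// record each inserted operand at its lane, so the whole chain can be
// treated as one build-vector. A hypothetical stand-in for that IR chain:
#include <cassert>
#include <optional>
#include <vector>

struct InsertRec {
  const InsertRec *Prev = nullptr; // operand 0 of the insert, or null
  unsigned Lane = 0;               // constant insertion index
  int Val = 0;                     // the inserted scalar
};

static std::vector<std::optional<int>>
collectBuildVector(const InsertRec *Last, unsigned VF) {
  std::vector<std::optional<int>> Opds(VF);
  for (const InsertRec *I = Last; I; I = I->Prev)
    if (!Opds[I->Lane])            // keep the outermost insert for each lane
      Opds[I->Lane] = I->Val;
  return Opds;
}

int main() {
  InsertRec I0{nullptr, 0, 10}, I1{&I0, 1, 11}, I2{&I1, 2, 12}, I3{&I2, 3, 13};
  auto Opds = collectBuildVector(&I3, 4);
  assert(Opds[0] == 10 && Opds[3] == 13);
  return 0;
}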
17130 auto DominatedReduxValue = [&](Value *R) {
17131 return isa<Instruction>(R) &&
17132 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
17138 if (P->getIncomingBlock(0) == ParentBB) {
17139 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17140 } else if (P->getIncomingBlock(1) == ParentBB) {
17141 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17144 if (Rdx && DominatedReduxValue(Rdx))
17157 if (P->getIncomingBlock(0) == BBLatch) {
17158 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17159 } else if (P->getIncomingBlock(1) == BBLatch) {
17160 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17163 if (Rdx && DominatedReduxValue(Rdx))
17197 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17198 isa<IntrinsicInst>(Root)) &&
17199 "Expected binop, select, or intrinsic for reduction matching");
17201 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17203 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17205 return dyn_cast<Instruction>(RHS);
17207 return dyn_cast<Instruction>(LHS);
17214 Value *Op0 = nullptr;
17215 Value *Op1 = nullptr;
17218 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17224 Value *B0 = nullptr, *B1 = nullptr;
17229bool SLPVectorizerPass::vectorizeHorReduction(
17234 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
17236 if (Root->getParent() != BB || isa<PHINode>(Root))
17240 auto SelectRoot = [&]() {
17259 std::queue<std::pair<Instruction *, unsigned>> Stack;
17260 Stack.emplace(SelectRoot(), 0);
17264 if (R.isAnalyzedReductionRoot(Inst))
17269 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
17271 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
17273 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17274 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17281 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
17286 while (!Stack.empty()) {
17289 std::tie(Inst, Level) = Stack.front();
17294 if (R.isDeleted(Inst))
17296 if (Value *VectorizedV = TryToReduce(Inst)) {
17298 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
17300 Stack.emplace(I, Level);
17305 if (!TryAppendToPostponedInsts(Inst)) {
17316 if (VisitedInstrs.insert(Op).second)
17317 if (auto *I = dyn_cast<Instruction>(Op))
17320 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
17321 !R.isDeleted(I) && I->getParent() == BB)
17322 Stack.emplace(I, Level);
17331 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
17332 Res |= tryToVectorize(PostponedInsts, R);
17339 for (Value *V : Insts)
17340 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
17341 Res |= tryToVectorize(Inst, R);
17345bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
17347 if (!R.canMapToVector(IVI->getType()))
17355 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
17357 return tryToVectorizeList(BuildVectorOpds, R);
17366 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
17370 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
17371 return tryToVectorizeList(BuildVectorInsts, R);
17374template <typename T>
17379 bool MaxVFOnly, BoUpSLP &R) {
17380 bool Changed = false;
17389 auto *SameTypeIt = IncIt;
17390 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
17394 unsigned NumElts = (SameTypeIt - IncIt);
17395 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
17396 << NumElts << ")\n");
17407 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
17413 auto GetMinNumElements = [&R](Value *V) {
17414 unsigned EltSize = R.getVectorElementSize(V);
17415 return std::max(2U, R.getMaxVecRegSize() / EltSize);
17417 if (NumElts < GetMinNumElements(*IncIt) &&
17418 (Candidates.empty() ||
17419 Candidates.front()->getType() == (*IncIt)->getType())) {
17420 Candidates.append(IncIt, std::next(IncIt, NumElts));
17424 if (Candidates.size() > 1 &&
17425 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
17426 if (TryToVectorizeHelper(Candidates, false)) {
17429 } else if (MaxVFOnly) {
17431 for (auto *It = Candidates.begin(), *End = Candidates.end();
17433 auto *SameTypeIt = It;
17434 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
17436 unsigned NumElts = (SameTypeIt - It);
17437 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
17443 Candidates.clear();
17447 IncIt = SameTypeIt;
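// Editor's note (illustration only, not part of SLPVectorizer.cpp): the
// tryToVectorizeSequence template above scans a comparator-sorted sequence,
// groups maximal runs of "compatible" elements, hands each run to a
// vectorization callback, and accumulates runs that are too short to retry
// them later. A minimal standalone sketch of just the grouping loop over
// plain ints, where "compatible" is modelled as "same parity":
#include <cassert>
#include <functional>
#include <vector>

static unsigned forEachCompatibleRun(
    const std::vector<int> &Sorted,
    const std::function<bool(int, int)> &AreCompatible,
    const std::function<void(const int *, unsigned)> &TryRun) {
  unsigned NumRuns = 0;
  for (size_t I = 0; I < Sorted.size();) {
    size_t J = I + 1;
    while (J < Sorted.size() && AreCompatible(Sorted[J], Sorted[I]))
      ++J;                              // extend the run of compatible values
    TryRun(&Sorted[I], static_cast<unsigned>(J - I));
    ++NumRuns;
    I = J;                              // continue scanning after the run
  }
  return NumRuns;
}

int main() {
  std::vector<int> Sorted = {2, 4, 6, 1, 3, 8};
  unsigned Runs = forEachCompatibleRun(
      Sorted, [](int A, int B) { return A % 2 == B % 2; },
      [](const int *, unsigned) {});
  assert(Runs == 3); // {2,4,6}, {1,3}, {8}
  return 0;
}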
17459template <bool IsCompatibility>
17464 "Expected valid element types only.");
17466 return IsCompatibility;
17467 auto *CI1 = cast<CmpInst>(V);
17468 auto *CI2 = cast<CmpInst>(V2);
17469 if (CI1->getOperand(0)->getType()->getTypeID() <
17471 return !IsCompatibility;
17472 if (CI1->getOperand(0)->getType()->getTypeID() >
17481 if (BasePred1 < BasePred2)
17482 return !IsCompatibility;
17483 if (BasePred1 > BasePred2)
17486 bool CI1Preds = Pred1 == BasePred1;
17487 bool CI2Preds = Pred2 == BasePred1;
17488 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
17489 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
17490 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
17494 return !IsCompatibility;
17497 if (auto *I1 = dyn_cast<Instruction>(Op1))
17498 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
17499 if (IsCompatibility) {
17500 if (I1->getParent() != I2->getParent())
17507 return NodeI2 != nullptr;
17510 assert((NodeI1 == NodeI2) ==
17512 "Different nodes should have different DFS numbers");
17513 if (NodeI1 != NodeI2)
17517 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
17519 if (IsCompatibility)
17521 if (I1->getOpcode() != I2->getOpcode())
17522 return I1->getOpcode() < I2->getOpcode();
17525 return IsCompatibility;
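// Editor's note (illustration only, not part of SLPVectorizer.cpp): the
// compareCmp<IsCompatibility> template above serves two roles: with
// IsCompatibility = false it is a strict-weak-ordering comparator ("is V
// ordered before V2?"), with IsCompatibility = true it is an equivalence
// check ("may V and V2 go into one bundle?"). The pattern is that every
// "ordered before" exit returns !IsCompatibility, every "ordered after"
// exit returns false, and full equality returns IsCompatibility. A
// hypothetical reduced version over pairs of ints:
#include <cassert>
#include <utility>

template <bool IsCompatibility>
static bool comparePairs(std::pair<int, int> A, std::pair<int, int> B) {
  if (A.first < B.first)
    return !IsCompatibility;  // ordered before B; not equivalent
  if (A.first > B.first)
    return false;             // ordered after B; not equivalent either
  if (A.second < B.second)
    return !IsCompatibility;
  if (A.second > B.second)
    return false;
  return IsCompatibility;     // equal keys: equivalent, but not "less than"
}

int main() {
  assert(comparePairs<false>({1, 2}, {1, 3}));  // ordering: strictly less
  assert(!comparePairs<false>({1, 3}, {1, 3})); // not less than itself
  assert(comparePairs<true>({1, 3}, {1, 3}));   // compatible with itself
  assert(!comparePairs<true>({1, 2}, {1, 3}));  // different: not compatible
  return 0;
}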
17528template <typename ItT>
17531 bool Changed = false;
17534 if (R.isDeleted(I))
17537 if (auto *RootOp = dyn_cast<Instruction>(Op))
17538 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
17542 if (R.isDeleted(I))
17544 Changed |= tryToVectorize(I, R);
17551 return compareCmp<false>(V, V2, *TLI, *DT);
17554 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
17557 return compareCmp<true>(V1, V2, *TLI, *DT);
17564 if (Vals.size() <= 1)
17566 Changed |= tryToVectorizeSequence<Value>(
17567 Vals, CompareSorter, AreCompatibleCompares,
17570 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
17572 auto *Select = dyn_cast<SelectInst>(U);
17574 Select->getParent() != cast<Instruction>(V)->getParent();
17577 if (ArePossiblyReducedInOtherBlock)
17579 return tryToVectorizeList(Candidates, R, MaxVFOnly);
17585bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
17587 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
17588 "This function only accepts Insert instructions");
17589 bool OpsChanged = false;
17592 for (auto *I : reverse(Instructions)) {
17593 if (R.isDeleted(I))
17595 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
17598 for (auto *I : reverse(Instructions)) {
17599 if (R.isDeleted(I) || isa<CmpInst>(I))
17601 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
17602 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
17603 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
17604 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
17608 OpsChanged |= tryToVectorize(PostponedInsts, R);
17615 bool Changed = false;
17622 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
17625 "Expected vectorizable types only.");
17634 if (Opcodes1.size() < Opcodes2.size())
17636 if (Opcodes1.size() > Opcodes2.size())
17638 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17641 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
17642 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
17647 return NodeI2 != nullptr;
17650 assert((NodeI1 == NodeI2) ==
17652 "Different nodes should have different DFS numbers");
17653 if (NodeI1 != NodeI2)
17656 if (S.getOpcode() && !S.isAltShuffle())
17658 return I1->getOpcode() < I2->getOpcode();
17667 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
17668 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
17676 bool U1 = isa<UndefValue>(Opcodes1[I]);
17677 bool U2 = isa<UndefValue>(Opcodes2[I]);
17681 auto ValID1 = Opcodes1[I]->getValueID();
17682 auto ValID2 = Opcodes2[I]->getValueID();
17683 if (ValID1 == ValID2)
17685 if (ValID1 < ValID2)
17687 if (ValID1 > ValID2)
17696 assert(U1 && U2 && "The only thing left should be undef & undef.");
17701 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
17704 if (V1->getType() != V2->getType())
17708 if (Opcodes1.size() != Opcodes2.size())
17710 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17712 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
17714 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
17715 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
17716 if (I1->getParent() != I2->getParent())
17723 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
17725 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
17731 bool HaveVectorizedPhiNodes = false;
17742 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
17755 if (!Opcodes.empty())
17759 while (!Nodes.empty()) {
17760 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
17763 for (Value *V : PHI->incoming_values()) {
17764 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
17765 Nodes.push_back(PHI1);
17773 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
17774 Incoming, PHICompare, AreCompatiblePHIs,
17776 return tryToVectorizeList(Candidates, R, MaxVFOnly);
17779 Changed |= HaveVectorizedPhiNodes;
17781 } while (HaveVectorizedPhiNodes);
17783 VisitedInstrs.clear();
17785 InstSetVector PostProcessInserts;
17789 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
17790 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
17791 if (VectorizeCmps) {
17792 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
17793 PostProcessCmps.clear();
17795 PostProcessInserts.clear();
17800 if (auto *Cmp = dyn_cast<CmpInst>(I))
17801 return PostProcessCmps.contains(Cmp);
17802 return isa<InsertElementInst, InsertValueInst>(I) &&
17803 PostProcessInserts.contains(I);
17809 return I->use_empty() &&
17810 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
17815 if (isa<ScalableVectorType>(It->getType()))
17819 if (R.isDeleted(&*It))
17822 if (!VisitedInstrs.insert(&*It).second) {
17823 if (HasNoUsers(&*It) &&
17824 VectorizeInsertsAndCmps(It->isTerminator())) {
17834 if (isa<DbgInfoIntrinsic>(It))
17838 if (PHINode *P = dyn_cast<PHINode>(It)) {
17840 if (P->getNumIncomingValues() == 2) {
17843 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
17852 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
17857 if (BB == P->getIncomingBlock(I) ||
17858 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
17863 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
17864 PI && !IsInPostProcessInstrs(PI))
17865 Changed |= vectorizeRootInstruction(nullptr, PI,
17866 P->getIncomingBlock(I), R, TTI);
17871 if (HasNoUsers(&*It)) {
17872 bool OpsChanged = false;
17873 auto *SI = dyn_cast<StoreInst>(It);
17883 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
17884 SI->getValueOperand()->hasOneUse();
17886 if (TryToVectorizeRoot) {
17887 for (auto *V : It->operand_values()) {
17890 if (auto *VI = dyn_cast<Instruction>(V);
17891 VI && !IsInPostProcessInstrs(VI))
17893 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
17900 VectorizeInsertsAndCmps(It->isTerminator());
17911 if (isa<InsertElementInst, InsertValueInst>(It))
17912 PostProcessInserts.insert(&*It);
17913 else if (isa<CmpInst>(It))
17914 PostProcessCmps.insert(cast<CmpInst>(&*It));
17921 auto Changed = false;
17922 for (auto &Entry : GEPs) {
17925 if (Entry.second.size() < 2)
17928 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
17929 << Entry.second.size() << ".\n");
17936 unsigned MaxVecRegSize = R.getMaxVecRegSize();
17937 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
17938 if (MaxVecRegSize < EltSize)
17941 unsigned MaxElts = MaxVecRegSize / EltSize;
17942 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
17943 auto Len = std::min<unsigned>(BE - BI, MaxElts);
17956 Candidates.remove_if([&R](Value *I) {
17957 return R.isDeleted(cast<Instruction>(I)) ||
17958 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
17966 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
17967 auto *GEPI = GEPList[I];
17968 if (!Candidates.count(GEPI))
17970 auto *SCEVI = SE->getSCEV(GEPList[I]);
17971 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
17972 auto *GEPJ = GEPList[J];
17973 auto *SCEVJ = SE->getSCEV(GEPList[J]);
17974 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
17975 Candidates.remove(GEPI);
17976 Candidates.remove(GEPJ);
17977 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
17978 Candidates.remove(GEPJ);
17985 if (Candidates.size() < 2)
17992 auto BundleIndex = 0u;
17993 for (auto *V : Candidates) {
17994 auto *GEP = cast<GetElementPtrInst>(V);
17995 auto *GEPIdx = GEP->idx_begin()->get();
17996 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
17997 Bundle[BundleIndex++] = GEPIdx;
18009 Changed |= tryToVectorizeList(Bundle, R);
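// Editor's note (illustration only, not part of SLPVectorizer.cpp): the GEP
// bundling loop above prunes candidate pairs whose address SCEVs differ by a
// compile-time constant, and drops the second GEP when two candidates reuse
// the same index value; the index operands of the surviving GEPs are then
// bundled and handed to tryToVectorizeList. A hypothetical model of that
// pruning where an address is (symbolic base id, constant offset), so the
// difference is "a known constant" exactly when the bases match:
#include <cassert>
#include <utility>
#include <vector>

using Addr = std::pair<int, int>; // (symbolic base id, constant byte offset)

static std::vector<Addr> pruneConstantRelated(std::vector<Addr> Cands) {
  std::vector<bool> Removed(Cands.size(), false);
  for (size_t I = 0; I < Cands.size(); ++I) {
    if (Removed[I])
      continue;
    for (size_t J = I + 1; J < Cands.size(); ++J) {
      if (!Removed[J] && Cands[I].first == Cands[J].first) {
        Removed[I] = Removed[J] = true; // constant difference: drop the pair
        break;
      }
    }
  }
  std::vector<Addr> Kept;
  for (size_t I = 0; I < Cands.size(); ++I)
    if (!Removed[I])
      Kept.push_back(Cands[I]);
  return Kept;
}

int main() {
  // base 0 + 0 and base 0 + 16 differ by a constant -> both dropped.
  auto Kept = pruneConstantRelated({{0, 0}, {0, 16}, {1, 8}});
  assert(Kept.size() == 1 && Kept.front().first == 1);
  return 0;
}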
18015bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
18016 bool Changed = false;
18021 if (V->getValueOperand()->getType()->getTypeID() <
18022 V2->getValueOperand()->getType()->getTypeID())
18024 if (V->getValueOperand()->getType()->getTypeID() >
18025 V2->getValueOperand()->getType()->getTypeID())
18027 if (V->getPointerOperandType()->getTypeID() <
18028 V2->getPointerOperandType()->getTypeID())
18030 if (V->getPointerOperandType()->getTypeID() >
18031 V2->getPointerOperandType()->getTypeID())
18034 if (isa<UndefValue>(V->getValueOperand()) ||
18035 isa<UndefValue>(V2->getValueOperand()))
18037 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
18038 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18040 DT->getNode(I1->getParent());
18042 DT->getNode(I2->getParent());
18043 assert(NodeI1 && "Should only process reachable instructions");
18044 assert(NodeI2 && "Should only process reachable instructions");
18045 assert((NodeI1 == NodeI2) ==
18047 "Different nodes should have different DFS numbers");
18048 if (NodeI1 != NodeI2)
18053 return I1->getOpcode() < I2->getOpcode();
18055 if (isa<Constant>(V->getValueOperand()) &&
18056 isa<Constant>(V2->getValueOperand()))
18058 return V->getValueOperand()->getValueID() <
18059 V2->getValueOperand()->getValueID();
18071 isa<UndefValue>(V2->getValueOperand()))
18074 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18075 if (I1->getParent() != I2->getParent())
18078 return S.getOpcode() > 0;
18081 isa<Constant>(V2->getValueOperand()))
18084 V2->getValueOperand()->getValueID();
18088 for (auto &Pair : Stores) {
18089 if (Pair.second.size() < 2)
18093 << Pair.second.size() << ".\n");
18102 Pair.second.rend());
18103 Changed |= tryToVectorizeSequence<StoreInst>(
18104 ReversedStores, StoreSorter, AreCompatibleStores,
18106 return vectorizeStores(Candidates, R);
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
const BasicBlock * getParent() const
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
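The Type predicates above are the usual way to filter element types before attempting vectorization. A hedged sketch of such a filter (the helper and the exact policy are illustrative, not the pass's actual rule):

  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Reject the unusual long-double formats and anything that is not a
  // single-value register type.
  static bool isPlausibleElementType(Type *Ty) {
    if (!Ty->isSingleValueType() || Ty->isX86_FP80Ty() || Ty->isPPC_FP128Ty())
      return false;
    return Ty->isIntOrIntVectorTy() || Ty->isFloatingPointTy() || Ty->isPointerTy();
  }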
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
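The Value use-list queries above (hasOneUse, hasNUses, users, getNumUses) let a pass reason about how widely a result is consumed. A small sketch (helper name illustrative):

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/Casting.h"
  using namespace llvm;

  // Count how many users of V are instructions, via the users() range.
  static unsigned countInstructionUsers(const Value *V) {
    unsigned N = 0;
    for (const User *U : V->users())
      if (isa<Instruction>(U))
        ++N;
    return N;
  }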
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
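VectorType::get pairs an element type with an ElementCount, which is how fixed and scalable vector types are both constructed. A minimal sketch:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  // Build a fixed <8 x i32> vector type.
  static VectorType *makeI32x8(LLVMContext &Ctx) {
    return VectorType::get(Type::getInt32Ty(Ctx), ElementCount::getFixed(8));
  }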
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
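raw_string_ostream and raw_svector_ostream are the usual way debug printers assemble labels before emitting them. A short sketch (function name illustrative):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/raw_ostream.h"
  #include <string>
  using namespace llvm;

  // Render an indented label into a std::string via raw_string_ostream.
  static std::string renderLabel(unsigned Depth, StringRef Name) {
    std::string Buf;
    raw_string_ostream OS(Buf);
    OS.indent(Depth * 2) << Name;
    OS.flush();
    return Buf;
  }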
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scattered load, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isLoadCombineCandidate() const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair which has the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
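Taken together, the BoUpSLP entry points listed above describe the usual driver sequence: build the tree, bail out on tiny trees, reorder, compute external uses and minimum value sizes, cost the tree, and only then emit vector code. A hedged sketch of that sequence as it might appear inside SLPVectorizer.cpp (the helper name and threshold are illustrative; error handling is elided):

  // Illustrative only; mirrors the listed BoUpSLP API, not the pass verbatim.
  static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> Bundle,
                                 InstructionCost Threshold) {
    SmallDenseSet<Value *> Ignored;   // no user-ignored values
    R.buildTree(Bundle, Ignored);
    if (R.isTreeTinyAndNotFullyVectorizable())
      return false;                   // not worth the overhead
    R.reorderTopToBottom();
    R.reorderBottomToTop();
    R.buildExternalUses();
    R.computeMinimumValueSizes();
    InstructionCost Cost = R.getTreeCost();
    if (Cost >= Threshold)
      return false;                   // unprofitable
    R.vectorizeTree();
    return true;
  }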
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
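The m_* combinators above compose structurally: match() walks the operand tree and the binders (m_Value, m_APInt, m_Specific, ...) capture sub-operands. A small sketch (helper name illustrative; this matches the operands in the written order only):

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Recognize  or (shl X, C), Y  and capture X, Y and the shift amount C.
  static bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&ShAmt) {
    return match(V, m_Or(m_Shl(m_Value(X), m_APInt(ShAmt)), m_Value(Y)));
  }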
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
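getPointersDiff underlies consecutive-access checks: if two pointers provably differ by exactly one element, the accesses are adjacent in memory. A hedged sketch (helper name illustrative; analyses are taken as parameters):

  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/IR/Instructions.h"
  #include <optional>
  using namespace llvm;

  // Are L1's bytes laid out immediately after L0's?
  static bool areConsecutiveLoads(LoadInst *L0, LoadInst *L1,
                                  const DataLayout &DL, ScalarEvolution &SE) {
    std::optional<int> Diff =
        getPointersDiff(L0->getType(), L0->getPointerOperand(),
                        L1->getType(), L1->getPointerOperand(), DL, SE,
                        /*StrictCheck=*/true);
    return Diff && *Diff == 1;
  }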
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
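The range helpers above and below (all_of, any_of, find_if, enumerate, ...) replace explicit begin/end loops. A small sketch combining two of them (helper name illustrative):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  using namespace llvm;

  // Return the index of the first negative element, or false if none exists.
  static bool firstNegativeIndex(ArrayRef<int> Vals, size_t &Index) {
    if (all_of(Vals, [](int V) { return V >= 0; }))
      return false;
    for (auto En : enumerate(Vals))
      if (En.value() < 0) {
        Index = En.index();
        return true;
      }
    return false;
  }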
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
testing::Matcher< const detail::ErrorHolder & > Failed()
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is congruent to Skew modulo Align.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
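The integer helpers scattered through this listing (divideCeil, PowerOf2Ceil, bit_floor, Log2_32, isPowerOf2_32, ...) are the arithmetic used when clamping counts to power-of-two sizes. A few worked values, assuming the usual MathExtras/bit.h semantics:

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  using namespace llvm;

  static void mathHelperExamples() {
    assert(PowerOf2Ceil(6) == 8);    // round up to a power of two
    assert(bit_floor(6u) == 4u);     // round down to a power of two
    assert(divideCeil(7, 2) == 4);   // integer ceil division
    assert(Log2_32(16) == 4);        // floor log2
    assert(isPowerOf2_32(32));       // exact power-of-two test
  }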
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const