73#ifdef EXPENSIVE_CHECKS
105using namespace slpvectorizer;
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
110STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
114 cl::desc(
"Run the SLP vectorization passes"));
118 cl::desc(
"Only vectorize if you gain more than this "
123 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
128 cl::desc(
"Attempt to vectorize horizontal reductions"));
133 "Attempt to vectorize horizontal reductions feeding into a store"));
139 cl::desc(
"Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
144 cl::desc(
"Attempt to vectorize for this register size in bits"));
148 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
156 cl::desc(
"Limit the size of the SLP scheduling region per block"));
160 cl::desc(
"Attempt to vectorize for this register size in bits"));
164 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
168 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
174 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
183 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
187 cl::desc(
"The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
192 cl::desc(
"The maximum stride, considered to be profitable."));
196 cl::desc(
"Display the SLP trees with Graphviz"));
200 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
230 return VectorType::isValidElementType(Ty) && !Ty->
isX86_FP80Ty() &&
237 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
244 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
245 !isa<ExtractValueInst, UndefValue>(V))
247 auto *
I = dyn_cast<Instruction>(V);
248 if (!
I || isa<ExtractValueInst>(
I))
250 if (!isa<FixedVectorType>(
I->getOperand(0)->getType()))
252 if (isa<ExtractElementInst>(
I))
254 assert(isa<InsertElementInst>(V) &&
"Expected only insertelement.");
263 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
279 for (
int I = 1, E = VL.
size();
I < E;
I++) {
280 auto *II = dyn_cast<Instruction>(VL[
I]);
301 Value *FirstNonUndef =
nullptr;
302 for (
Value *V : VL) {
303 if (isa<UndefValue>(V))
305 if (!FirstNonUndef) {
309 if (V != FirstNonUndef)
312 return FirstNonUndef !=
nullptr;
317 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
318 return Cmp->isCommutative();
319 if (
auto *BO = dyn_cast<BinaryOperator>(
I))
320 return BO->isCommutative() ||
321 (BO->getOpcode() == Instruction::Sub &&
327 ICmpInst::Predicate Pred;
328 if (match(U.getUser(),
329 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
330 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
334 return match(U.getUser(),
335 m_Intrinsic<Intrinsic::abs>(
336 m_Specific(U.get()), m_ConstantInt(Flag))) &&
337 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
340 (BO->getOpcode() == Instruction::FSub &&
343 return match(U.getUser(),
344 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
346 return I->isCommutative();
354 if (
const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
355 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
358 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
361 if (CI->getValue().uge(VT->getNumElements()))
363 Index *= VT->getNumElements();
364 Index += CI->getZExtValue();
368 const auto *
IV = cast<InsertValueInst>(InsertInst);
369 Type *CurrentType =
IV->getType();
370 for (
unsigned I :
IV->indices()) {
371 if (
const auto *ST = dyn_cast<StructType>(CurrentType)) {
372 Index *= ST->getNumElements();
373 CurrentType = ST->getElementType(
I);
374 }
else if (
const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
375 Index *= AT->getNumElements();
376 CurrentType = AT->getElementType();
409 if (MaskArg == UseMask::UndefsAsMask)
413 if (MaskArg == UseMask::FirstArg &&
Value < VF)
414 UseMask.reset(
Value);
415 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
416 UseMask.reset(
Value - VF);
424template <
bool IsPoisonOnly = false>
428 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
431 auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
434 auto *
C = dyn_cast<Constant>(V);
436 if (!UseMask.empty()) {
438 while (
auto *II = dyn_cast<InsertElementInst>(
Base)) {
439 Base = II->getOperand(0);
440 if (isa<T>(II->getOperand(1)))
447 if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
455 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
462 for (
unsigned I = 0, E = VecTy->getNumElements();
I != E; ++
I) {
463 if (
Constant *Elem =
C->getAggregateElement(
I))
465 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
493static std::optional<TargetTransformInfo::ShuffleKind>
495 const auto *It =
find_if(VL, IsaPred<ExtractElementInst>);
498 auto *EI0 = cast<ExtractElementInst>(*It);
499 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
502 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
503 Value *Vec1 =
nullptr;
504 Value *Vec2 =
nullptr;
506 ShuffleMode CommonShuffleMode =
Unknown;
508 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
510 if (isa<UndefValue>(VL[
I]))
512 auto *EI = cast<ExtractElementInst>(VL[
I]);
513 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
515 auto *Vec = EI->getVectorOperand();
520 if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
Size)
522 if (isa<UndefValue>(EI->getIndexOperand()))
524 auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
530 unsigned IntIdx =
Idx->getValue().getZExtValue();
534 if (!Vec1 || Vec1 == Vec) {
536 }
else if (!Vec2 || Vec2 == Vec) {
542 if (CommonShuffleMode == Permute)
547 CommonShuffleMode = Permute;
550 CommonShuffleMode =
Select;
553 if (CommonShuffleMode ==
Select && Vec2)
564 assert((Opcode == Instruction::ExtractElement ||
565 Opcode == Instruction::ExtractValue) &&
566 "Expected extractelement or extractvalue instruction.");
567 if (Opcode == Instruction::ExtractElement) {
568 auto *CI = dyn_cast<ConstantInt>(E->
getOperand(1));
571 return CI->getZExtValue();
573 auto *EI = cast<ExtractValueInst>(E);
574 if (EI->getNumIndices() != 1)
576 return *EI->idx_begin();
582struct InstructionsState {
584 Value *OpValue =
nullptr;
595 unsigned getAltOpcode()
const {
600 bool isAltShuffle()
const {
return AltOp != MainOp; }
603 unsigned CheckedOpcode =
I->getOpcode();
604 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
607 InstructionsState() =
delete;
609 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
618 auto *
I = dyn_cast<Instruction>(
Op);
619 if (
I && S.isOpcodeOrAlt(
I))
638 unsigned BaseIndex = 0);
646 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
647 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
648 BaseOp0 == Op0 || BaseOp1 == Op1 ||
659 "Assessing comparisons of different types?");
669 return (BasePred == Pred &&
671 (BasePred == SwappedPred &&
680 unsigned BaseIndex) {
683 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
685 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
686 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
687 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
689 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
691 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
692 unsigned AltOpcode = Opcode;
693 unsigned AltIndex = BaseIndex;
695 bool SwappedPredsCompatible = [&]() {
699 UniquePreds.
insert(BasePred);
700 UniqueNonSwappedPreds.
insert(BasePred);
701 for (
Value *V : VL) {
702 auto *
I = dyn_cast<CmpInst>(V);
708 UniqueNonSwappedPreds.
insert(CurrentPred);
709 if (!UniquePreds.
contains(CurrentPred) &&
710 !UniquePreds.
contains(SwappedCurrentPred))
711 UniquePreds.
insert(CurrentPred);
716 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
720 auto *IBase = cast<Instruction>(VL[BaseIndex]);
723 if (
auto *
CallBase = dyn_cast<CallInst>(IBase)) {
727 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
729 for (
int Cnt = 0, E = VL.
size(); Cnt < E; Cnt++) {
730 auto *
I = cast<Instruction>(VL[Cnt]);
731 unsigned InstOpcode =
I->getOpcode();
732 if (IsBinOp && isa<BinaryOperator>(
I)) {
733 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
737 AltOpcode = InstOpcode;
741 }
else if (IsCastOp && isa<CastInst>(
I)) {
742 Value *Op0 = IBase->getOperand(0);
744 Value *Op1 =
I->getOperand(0);
747 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
749 if (Opcode == AltOpcode) {
752 "Cast isn't safe for alternation, logic needs to be updated!");
753 AltOpcode = InstOpcode;
758 }
else if (
auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
759 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
760 Type *Ty0 = BaseInst->getOperand(0)->getType();
761 Type *Ty1 = Inst->getOperand(0)->getType();
763 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
770 if ((E == 2 || SwappedPredsCompatible) &&
771 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
776 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
777 if (AltIndex != BaseIndex) {
780 }
else if (BasePred != CurrentPred) {
783 "CmpInst isn't safe for alternation, logic needs to be updated!");
788 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
789 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
792 }
else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
793 if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
794 if (Gep->getNumOperands() != 2 ||
795 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
796 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
797 }
else if (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
799 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
800 }
else if (
auto *LI = dyn_cast<LoadInst>(
I)) {
801 auto *BaseLI = cast<LoadInst>(IBase);
802 if (!LI->isSimple() || !BaseLI->isSimple())
803 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
804 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
805 auto *
CallBase = cast<CallInst>(IBase);
807 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
808 if (Call->hasOperandBundles() &&
809 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
810 Call->op_begin() + Call->getBundleOperandsEndIndex(),
813 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
816 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
819 if (Mappings.
size() != BaseMappings.
size() ||
820 Mappings.
front().ISA != BaseMappings.
front().ISA ||
821 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
822 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
823 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
824 Mappings.
front().Shape.Parameters !=
825 BaseMappings.
front().Shape.Parameters)
826 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
831 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
834 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
835 cast<Instruction>(VL[AltIndex]));
851 case Instruction::Load: {
852 LoadInst *LI = cast<LoadInst>(UserInst);
855 case Instruction::Store: {
856 StoreInst *SI = cast<StoreInst>(UserInst);
857 return (SI->getPointerOperand() == Scalar);
859 case Instruction::Call: {
860 CallInst *CI = cast<CallInst>(UserInst);
863 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
864 Arg.value().get() == Scalar;
876 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
883 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
884 return LI->isSimple();
886 return SI->isSimple();
888 return !
MI->isVolatile();
896 bool ExtendingManyInputs =
false) {
900 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
902 (SubMask.
size() == Mask.size() &&
903 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
904 [](
int Idx) { return Idx == PoisonMaskElem; }))) &&
905 "SubMask with many inputs support must be larger than the mask.");
907 Mask.append(SubMask.
begin(), SubMask.
end());
911 int TermValue = std::min(Mask.size(), SubMask.
size());
912 for (
int I = 0, E = SubMask.
size();
I < E; ++
I) {
914 (!ExtendingManyInputs &&
915 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
917 NewMask[
I] = Mask[SubMask[
I]];
933 const unsigned Sz = Order.
size();
936 for (
unsigned I = 0;
I < Sz; ++
I) {
938 UnusedIndices.
reset(Order[
I]);
940 MaskedIndices.
set(
I);
942 if (MaskedIndices.
none())
945 "Non-synced masked/available indices.");
949 assert(
Idx >= 0 &&
"Indices must be synced.");
961 const unsigned E = Indices.
size();
963 for (
unsigned I = 0;
I < E; ++
I)
964 Mask[Indices[
I]] =
I;
970 assert(!Mask.empty() &&
"Expected non-empty mask.");
974 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
976 Scalars[Mask[
I]] = Prev[
I];
984 auto *
I = dyn_cast<Instruction>(V);
989 auto *IO = dyn_cast<Instruction>(V);
992 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1001 auto *
I = dyn_cast<Instruction>(V);
1005 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1007 auto *IU = dyn_cast<Instruction>(U);
1010 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1026 return !VL.
empty() &&
1030namespace slpvectorizer {
1035 struct ScheduleData;
1060 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1061 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1115 return !VectorizableTree.
empty() &&
1116 !VectorizableTree.
front()->UserTreeIndices.empty();
1121 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1122 return VectorizableTree.
front()->Scalars;
1137 VectorizableTree.
clear();
1138 ScalarToTreeEntry.clear();
1139 MultiNodeScalars.clear();
1141 NonScheduledFirst.
clear();
1142 EntryToLastInstruction.clear();
1143 ExternalUses.
clear();
1144 ExternalUsesAsGEPs.clear();
1145 for (
auto &Iter : BlocksSchedules) {
1146 BlockScheduling *BS = Iter.second.get();
1150 ReductionBitWidth = 0;
1151 CastMaxMinBWSizes.reset();
1152 ExtraBitWidthNodes.
clear();
1153 InstrElementSize.clear();
1154 UserIgnoreList =
nullptr;
1155 PostponedGathers.
clear();
1156 ValueToGatherNodes.
clear();
1213 return MaxVecRegSize;
1218 return MinVecRegSize;
1226 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
1228 return MaxVF ? MaxVF : UINT_MAX;
1272 bool TryRecursiveCheck =
true)
const;
1296 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1297 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1319 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1320 MaxLevel(MaxLevel) {}
1374 if (isa<LoadInst>(V1)) {
1376 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1381 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1383 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1386 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1389 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1391 ((
int)V1->getNumUses() == NumLanes ||
1392 AllUsersAreInternal(V1, V2)))
1398 auto CheckSameEntryOrFail = [&]() {
1399 if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1400 TE1 && TE1 == R.getTreeEntry(V2))
1405 auto *LI1 = dyn_cast<LoadInst>(V1);
1406 auto *LI2 = dyn_cast<LoadInst>(V2);
1408 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1410 return CheckSameEntryOrFail();
1413 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1414 LI2->getPointerOperand(),
DL, SE,
true);
1415 if (!Dist || *Dist == 0) {
1418 R.TTI->isLegalMaskedGather(
1422 return CheckSameEntryOrFail();
1426 if (std::abs(*Dist) > NumLanes / 2)
1435 auto *C1 = dyn_cast<Constant>(V1);
1436 auto *C2 = dyn_cast<Constant>(V2);
1450 if (isa<UndefValue>(V2))
1454 Value *EV2 =
nullptr;
1467 int Dist = Idx2 - Idx1;
1470 if (std::abs(Dist) == 0)
1472 if (std::abs(Dist) > NumLanes / 2)
1479 return CheckSameEntryOrFail();
1482 auto *I1 = dyn_cast<Instruction>(V1);
1483 auto *I2 = dyn_cast<Instruction>(V2);
1485 if (I1->getParent() != I2->getParent())
1486 return CheckSameEntryOrFail();
1493 if (S.getOpcode() &&
1494 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1495 !S.isAltShuffle()) &&
1497 return cast<Instruction>(V)->getNumOperands() ==
1498 S.MainOp->getNumOperands();
1504 if (isa<UndefValue>(V2))
1507 return CheckSameEntryOrFail();
1541 int ShallowScoreAtThisLevel =
1550 auto *I1 = dyn_cast<Instruction>(
LHS);
1551 auto *I2 = dyn_cast<Instruction>(
RHS);
1552 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1554 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1555 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1556 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1557 ShallowScoreAtThisLevel))
1558 return ShallowScoreAtThisLevel;
1559 assert(I1 && I2 &&
"Should have early exited.");
1566 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1567 OpIdx1 != NumOperands1; ++OpIdx1) {
1569 int MaxTmpScore = 0;
1570 unsigned MaxOpIdx2 = 0;
1571 bool FoundBest =
false;
1575 ? I2->getNumOperands()
1576 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1577 assert(FromIdx <= ToIdx &&
"Bad index");
1578 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1580 if (Op2Used.
count(OpIdx2))
1585 I1, I2, CurrLevel + 1, std::nullopt);
1588 TmpScore > MaxTmpScore) {
1589 MaxTmpScore = TmpScore;
1596 Op2Used.
insert(MaxOpIdx2);
1597 ShallowScoreAtThisLevel += MaxTmpScore;
1600 return ShallowScoreAtThisLevel;
1631 struct OperandData {
1632 OperandData() =
default;
1633 OperandData(
Value *V,
bool APO,
bool IsUsed)
1634 : V(V), APO(APO), IsUsed(IsUsed) {}
1644 bool IsUsed =
false;
1653 enum class ReorderingMode {
1670 const Loop *L =
nullptr;
1673 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
1674 return OpsVec[OpIdx][Lane];
1678 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
1679 return OpsVec[OpIdx][Lane];
1684 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
1685 OpIdx != NumOperands; ++OpIdx)
1686 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1688 OpsVec[OpIdx][Lane].IsUsed =
false;
1692 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
1693 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1705 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1706 Value *IdxLaneV = getData(
Idx, Lane).V;
1707 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1710 for (
unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1713 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1714 if (!isa<Instruction>(OpIdxLnV))
1716 Uniques.
insert(OpIdxLnV);
1718 int UniquesCount = Uniques.
size();
1719 int UniquesCntWithIdxLaneV =
1720 Uniques.
contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1721 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1722 int UniquesCntWithOpIdxLaneV =
1723 Uniques.
contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1724 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1727 UniquesCntWithOpIdxLaneV) -
1728 (
PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1737 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1738 Value *IdxLaneV = getData(
Idx, Lane).V;
1739 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1748 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1749 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1751 return R.areAllUsersVectorized(IdxLaneI)
1759 static const int ScoreScaleFactor = 10;
1767 int Lane,
unsigned OpIdx,
unsigned Idx,
1777 int SplatScore = getSplatScore(Lane, OpIdx,
Idx);
1778 if (Score <= -SplatScore) {
1783 Score += SplatScore;
1789 Score *= ScoreScaleFactor;
1790 Score += getExternalUseScore(Lane, OpIdx,
Idx);
1808 std::optional<unsigned>
1809 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
1812 unsigned NumOperands = getNumOperands();
1815 Value *OpLastLane = getData(OpIdx, LastLane).V;
1818 ReorderingMode RMode = ReorderingModes[OpIdx];
1819 if (RMode == ReorderingMode::Failed)
1820 return std::nullopt;
1823 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1829 std::optional<unsigned>
Idx;
1833 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
1839 bool IsUsed = RMode == ReorderingMode::Splat ||
1840 RMode == ReorderingMode::Constant ||
1841 RMode == ReorderingMode::Load;
1843 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
1845 OperandData &OpData = getData(
Idx, Lane);
1847 bool OpAPO = OpData.APO;
1856 if (OpAPO != OpIdxAPO)
1861 case ReorderingMode::Load:
1862 case ReorderingMode::Opcode: {
1863 bool LeftToRight = Lane > LastLane;
1864 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
1865 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
1866 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1867 OpIdx,
Idx, IsUsed);
1868 if (Score >
static_cast<int>(BestOp.Score) ||
1869 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
1872 BestOp.Score = Score;
1873 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1877 case ReorderingMode::Constant:
1878 if (isa<Constant>(
Op) ||
1879 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
1881 if (isa<Constant>(
Op)) {
1883 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1886 if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
1890 case ReorderingMode::Splat:
1891 if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
1892 IsUsed =
Op == OpLastLane;
1893 if (
Op == OpLastLane) {
1895 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1901 case ReorderingMode::Failed:
1907 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1911 return std::nullopt;
1918 unsigned getBestLaneToStartReordering()
const {
1919 unsigned Min = UINT_MAX;
1920 unsigned SameOpNumber = 0;
1931 for (
int I = getNumLanes();
I > 0; --
I) {
1932 unsigned Lane =
I - 1;
1933 OperandsOrderData NumFreeOpsHash =
1934 getMaxNumOperandsThatCanBeReordered(Lane);
1937 if (NumFreeOpsHash.NumOfAPOs < Min) {
1938 Min = NumFreeOpsHash.NumOfAPOs;
1939 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1941 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1942 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1943 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1946 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1947 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1948 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1949 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1950 auto *It = HashMap.
find(NumFreeOpsHash.Hash);
1951 if (It == HashMap.
end())
1952 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1958 unsigned BestLane = 0;
1959 unsigned CntMin = UINT_MAX;
1961 if (
Data.second.first < CntMin) {
1962 CntMin =
Data.second.first;
1963 BestLane =
Data.second.second;
1970 struct OperandsOrderData {
1973 unsigned NumOfAPOs = UINT_MAX;
1976 unsigned NumOpsWithSameOpcodeParent = 0;
1990 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
1991 unsigned CntTrue = 0;
1992 unsigned NumOperands = getNumOperands();
2002 bool AllUndefs =
true;
2003 unsigned NumOpsWithSameOpcodeParent = 0;
2007 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2008 const OperandData &OpData = getData(OpIdx, Lane);
2013 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
2015 I->getParent() != Parent) {
2016 if (NumOpsWithSameOpcodeParent == 0) {
2017 NumOpsWithSameOpcodeParent = 1;
2019 Parent =
I->getParent();
2021 --NumOpsWithSameOpcodeParent;
2024 ++NumOpsWithSameOpcodeParent;
2028 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2029 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2033 OperandsOrderData
Data;
2034 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2035 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2043 assert((empty() || VL.
size() == getNumLanes()) &&
2044 "Expected same number of lanes");
2045 assert(isa<Instruction>(VL[0]) &&
"Expected instruction");
2046 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2047 constexpr unsigned IntrinsicNumOperands = 2;
2048 if (isa<IntrinsicInst>(VL[0]))
2049 NumOperands = IntrinsicNumOperands;
2050 OpsVec.
resize(NumOperands);
2051 unsigned NumLanes = VL.
size();
2052 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2053 OpsVec[OpIdx].
resize(NumLanes);
2054 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2055 assert(isa<Instruction>(VL[Lane]) &&
"Expected instruction");
2066 bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
2067 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2068 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2075 unsigned getNumOperands()
const {
return OpsVec.
size(); }
2078 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
2081 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
2082 return getData(OpIdx, Lane).V;
2086 bool empty()
const {
return OpsVec.
empty(); }
2089 void clear() { OpsVec.
clear(); }
2094 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2095 bool OpAPO = getData(OpIdx, Lane).APO;
2096 bool IsInvariant = L && L->isLoopInvariant(
Op);
2098 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2102 bool FoundCandidate =
false;
2103 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2104 OperandData &
Data = getData(OpI, Ln);
2105 if (
Data.APO != OpAPO ||
Data.IsUsed)
2107 Value *OpILane = getValue(OpI, Lane);
2108 bool IsConstantOp = isa<Constant>(OpILane);
2117 ((Lns > 2 && isa<Constant>(
Data.V)) ||
2123 isa<Constant>(
Data.V)))) ||
2130 (IsInvariant && !isa<Constant>(
Data.V) &&
2132 L->isLoopInvariant(
Data.V))) {
2133 FoundCandidate =
true;
2140 if (!FoundCandidate)
2143 return getNumLanes() == 2 || Cnt > 1;
2149 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
2153 appendOperandsOfVL(RootVL);
2160 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2161 "Expected same num of lanes across all operands");
2162 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2163 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2171 unsigned NumOperands = getNumOperands();
2172 unsigned NumLanes = getNumLanes();
2192 unsigned FirstLane = getBestLaneToStartReordering();
2195 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2196 Value *OpLane0 = getValue(OpIdx, FirstLane);
2199 if (isa<LoadInst>(OpLane0))
2200 ReorderingModes[OpIdx] = ReorderingMode::Load;
2201 else if (isa<Instruction>(OpLane0)) {
2203 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2204 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2206 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2208 else if (isa<Constant>(OpLane0))
2209 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2210 else if (isa<Argument>(OpLane0))
2212 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2215 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2222 auto &&SkipReordering = [
this]() {
2225 for (
const OperandData &
Data : Op0)
2228 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2247 if (SkipReordering())
2250 bool StrategyFailed =
false;
2258 for (
unsigned I = 0;
I < NumOperands; ++
I)
2259 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2261 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2264 int Lane = FirstLane +
Direction * Distance;
2265 if (Lane < 0 || Lane >= (
int)NumLanes)
2268 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2271 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2273 std::optional<unsigned> BestIdx = getBestOperand(
2274 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2281 swap(OpIdx, *BestIdx, Lane);
2284 StrategyFailed =
true;
2287 if (MainAltOps[OpIdx].
size() != 2) {
2288 OperandData &AltOp = getData(OpIdx, Lane);
2289 InstructionsState OpS =
2291 if (OpS.getOpcode() && OpS.isAltShuffle())
2298 if (!StrategyFailed)
2303#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2306 case ReorderingMode::Load:
2308 case ReorderingMode::Opcode:
2310 case ReorderingMode::Constant:
2312 case ReorderingMode::Splat:
2314 case ReorderingMode::Failed:
2335 const unsigned Indent = 2;
2338 OS <<
"Operand " << Cnt++ <<
"\n";
2339 for (
const OperandData &OpData : OpDataVec) {
2341 if (
Value *V = OpData.V)
2345 OS <<
", APO:" << OpData.APO <<
"}\n";
2367 int BestScore = Limit;
2368 std::optional<int>
Index;
2369 for (
int I : seq<int>(0, Candidates.size())) {
2371 Candidates[
I].second,
2374 if (Score > BestScore) {
2389 DeletedInstructions.insert(
I);
2395 return AnalyzedReductionsRoots.count(
I);
2400 AnalyzedReductionsRoots.insert(
I);
2414 AnalyzedReductionsRoots.clear();
2415 AnalyzedReductionVals.
clear();
2416 AnalyzedMinBWVals.
clear();
2428 return NonScheduledFirst.
contains(V);
2441 bool collectValuesToDemote(
const TreeEntry &E,
bool IsProfitableToDemoteRoot,
2445 unsigned &MaxDepthLevel,
2446 bool &IsProfitableToDemote,
2447 bool IsTruncRoot)
const;
2457 canReorderOperands(TreeEntry *UserTE,
2464 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2468 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2470 TreeEntry *TE =
nullptr;
2472 TE = getTreeEntry(V);
2473 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2475 auto It = MultiNodeScalars.find(V);
2476 if (It != MultiNodeScalars.end()) {
2477 for (TreeEntry *E : It->second) {
2478 if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2486 if (It != VL.
end()) {
2487 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2495 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2496 unsigned OpIdx)
const {
2497 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2498 const_cast<TreeEntry *
>(UserTE), OpIdx);
2502 bool areAllUsersVectorized(
2511 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
2515 getCastContextHint(
const TreeEntry &TE)
const;
2524 const EdgeInfo &EI);
2535 bool ResizeAllowed =
false)
const;
2546 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
2551 template <
typename BVTy,
typename ResTy,
typename...
Args>
2552 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
2557 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy);
2563 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
2570 std::optional<TargetTransformInfo::ShuffleKind>
2582 unsigned NumParts)
const;
2594 std::optional<TargetTransformInfo::ShuffleKind>
2595 isGatherShuffledSingleRegisterEntry(
2612 isGatherShuffledEntry(
2615 unsigned NumParts,
bool ForOrder =
false);
2622 Type *ScalarTy)
const;
2626 void setInsertPointAfterBundle(
const TreeEntry *E);
2634 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
2647 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
2663 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
2667 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2684 [Scalars](
Value *V,
int Idx) {
2685 return (isa<UndefValue>(V) &&
2686 Idx == PoisonMaskElem) ||
2687 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2690 if (!ReorderIndices.empty()) {
2697 return IsSame(Scalars, Mask);
2698 if (VL.
size() == ReuseShuffleIndices.size()) {
2700 return IsSame(Scalars, Mask);
2704 return IsSame(Scalars, ReuseShuffleIndices);
2707 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
2708 return State == TreeEntry::NeedToGather &&
2709 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2710 UserTreeIndices.front().UserTE == UserEI.UserTE;
2714 bool hasEqualOperands(
const TreeEntry &TE)
const {
2715 if (
TE.getNumOperands() != getNumOperands())
2718 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
2719 unsigned PrevCount =
Used.count();
2720 for (
unsigned K = 0;
K < E; ++
K) {
2723 if (getOperand(K) ==
TE.getOperand(
I)) {
2729 if (PrevCount ==
Used.count())
2738 unsigned getVectorFactor()
const {
2739 if (!ReuseShuffleIndices.empty())
2740 return ReuseShuffleIndices.size();
2741 return Scalars.
size();
2776 VecTreeTy &Container;
2800 assert(Operands[OpIdx].empty() &&
"Already resized?");
2802 "Number of operands is greater than the number of scalars.");
2808 void setOperandsInOrder() {
2810 auto *I0 = cast<Instruction>(Scalars[0]);
2811 Operands.resize(I0->getNumOperands());
2812 unsigned NumLanes = Scalars.size();
2813 for (
unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2814 OpIdx != NumOperands; ++OpIdx) {
2816 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2817 auto *
I = cast<Instruction>(Scalars[Lane]);
2818 assert(
I->getNumOperands() == NumOperands &&
2819 "Expected same number of operands");
2820 Operands[OpIdx][Lane] =
I->getOperand(OpIdx);
2844 unsigned getNumOperands()
const {
return Operands.size(); }
2847 Value *getSingleOperand(
unsigned OpIdx)
const {
2849 assert(!Operands[OpIdx].empty() &&
"No operand available");
2854 bool isAltShuffle()
const {
return MainOp != AltOp; }
2857 unsigned CheckedOpcode =
I->getOpcode();
2858 return (getOpcode() == CheckedOpcode ||
2859 getAltOpcode() == CheckedOpcode);
2866 auto *
I = dyn_cast<Instruction>(
Op);
2867 if (
I && isOpcodeOrAlt(
I))
2872 void setOperations(
const InstructionsState &S) {
2886 unsigned getOpcode()
const {
2887 return MainOp ? MainOp->
getOpcode() : 0;
2890 unsigned getAltOpcode()
const {
2896 int findLaneForValue(
Value *V)
const {
2897 unsigned FoundLane = std::distance(Scalars.begin(),
find(Scalars, V));
2898 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2899 if (!ReorderIndices.
empty())
2900 FoundLane = ReorderIndices[FoundLane];
2901 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2902 if (!ReuseShuffleIndices.
empty()) {
2903 FoundLane = std::distance(ReuseShuffleIndices.
begin(),
2904 find(ReuseShuffleIndices, FoundLane));
2918 bool isNonPowOf2Vec()
const {
2920 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
2921 "Reshuffling not supported with non-power-of-2 vectors yet.");
2922 return IsNonPowerOf2;
2929 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
2930 dbgs() <<
"Operand " << OpI <<
":\n";
2931 for (
const Value *V : Operands[OpI])
2934 dbgs() <<
"Scalars: \n";
2935 for (
Value *V : Scalars)
2937 dbgs() <<
"State: ";
2940 dbgs() <<
"Vectorize\n";
2942 case ScatterVectorize:
2943 dbgs() <<
"ScatterVectorize\n";
2945 case StridedVectorize:
2946 dbgs() <<
"StridedVectorize\n";
2949 dbgs() <<
"NeedToGather\n";
2952 dbgs() <<
"MainOp: ";
2954 dbgs() << *MainOp <<
"\n";
2957 dbgs() <<
"AltOp: ";
2959 dbgs() << *AltOp <<
"\n";
2962 dbgs() <<
"VectorizedValue: ";
2963 if (VectorizedValue)
2964 dbgs() << *VectorizedValue <<
"\n";
2967 dbgs() <<
"ReuseShuffleIndices: ";
2968 if (ReuseShuffleIndices.
empty())
2971 for (
int ReuseIdx : ReuseShuffleIndices)
2972 dbgs() << ReuseIdx <<
", ";
2974 dbgs() <<
"ReorderIndices: ";
2975 for (
unsigned ReorderIdx : ReorderIndices)
2976 dbgs() << ReorderIdx <<
", ";
2978 dbgs() <<
"UserTreeIndices: ";
2979 for (
const auto &EInfo : UserTreeIndices)
2980 dbgs() << EInfo <<
", ";
2987 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
2990 dbgs() <<
"SLP: " << Banner <<
":\n";
2992 dbgs() <<
"SLP: Costs:\n";
2993 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
2994 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
2995 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
2996 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2997 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3003 std::optional<ScheduleData *> Bundle,
3004 const InstructionsState &S,
3005 const EdgeInfo &UserTreeIdx,
3008 TreeEntry::EntryState EntryState =
3009 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3010 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3011 ReuseShuffleIndices, ReorderIndices);
3015 TreeEntry::EntryState EntryState,
3016 std::optional<ScheduleData *> Bundle,
3017 const InstructionsState &S,
3018 const EdgeInfo &UserTreeIdx,
3021 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3022 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3023 "Need to vectorize gather entry?");
3024 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3025 TreeEntry *
Last = VectorizableTree.
back().get();
3026 Last->Idx = VectorizableTree.
size() - 1;
3027 Last->State = EntryState;
3028 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3029 ReuseShuffleIndices.end());
3030 if (ReorderIndices.
empty()) {
3032 Last->setOperations(S);
3035 Last->Scalars.assign(VL.
size(),
nullptr);
3038 if (Idx >= VL.size())
3039 return UndefValue::get(VL.front()->getType());
3043 Last->setOperations(S);
3044 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3046 if (
Last->State != TreeEntry::NeedToGather) {
3047 for (
Value *V : VL) {
3048 const TreeEntry *
TE = getTreeEntry(V);
3050 "Scalar already in tree!");
3053 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3056 ScalarToTreeEntry[
V] =
Last;
3059 ScheduleData *BundleMember = *Bundle;
3060 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3063 "Bundle and VL out of sync");
3065 for (
Value *V : VL) {
3070 BundleMember->TE =
Last;
3071 BundleMember = BundleMember->NextInBundle;
3074 assert(!BundleMember &&
"Bundle and VL out of sync");
3077 bool AllConstsOrCasts =
true;
3080 auto *
I = dyn_cast<CastInst>(V);
3081 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3084 if (AllConstsOrCasts)
3086 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3087 MustGather.
insert(VL.begin(), VL.end());
3090 if (UserTreeIdx.UserTE) {
3091 Last->UserTreeIndices.push_back(UserTreeIdx);
3092 assert((!
Last->isNonPowOf2Vec() ||
Last->ReorderIndices.empty()) &&
3093 "Reordering isn't implemented for non-power-of-2 nodes yet");
3100 TreeEntry::VecTreeTy VectorizableTree;
3105 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3106 VectorizableTree[
Id]->dump();
3112 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
3114 const TreeEntry *getTreeEntry(
Value *V)
const {
3115 return ScalarToTreeEntry.lookup(V);
3124 bool areAltOperandsProfitable(
const InstructionsState &S,
3129 TreeEntry::EntryState getScalarsVectorizationState(
3162 using ValueToGatherNodesMap =
3164 ValueToGatherNodesMap ValueToGatherNodes;
3167 struct ExternalUser {
3191 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3192 auto It = AliasCache.
find(Key);
3193 if (It != AliasCache.
end())
3198 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3202 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3234 UserList ExternalUses;
3254 struct ScheduleData {
3257 enum { InvalidDeps = -1 };
3259 ScheduleData() =
default;
3261 void init(
int BlockSchedulingRegionID,
Value *OpVal) {
3262 FirstInBundle =
this;
3263 NextInBundle =
nullptr;
3264 NextLoadStore =
nullptr;
3265 IsScheduled =
false;
3266 SchedulingRegionID = BlockSchedulingRegionID;
3267 clearDependencies();
3274 if (hasValidDependencies()) {
3275 assert(UnscheduledDeps <= Dependencies &&
"invariant");
3277 assert(UnscheduledDeps == Dependencies &&
"invariant");
3281 assert(isSchedulingEntity() &&
3282 "unexpected scheduled state");
3283 for (
const ScheduleData *BundleMember =
this; BundleMember;
3284 BundleMember = BundleMember->NextInBundle) {
3285 assert(BundleMember->hasValidDependencies() &&
3286 BundleMember->UnscheduledDeps == 0 &&
3287 "unexpected scheduled state");
3288 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3289 "only bundle is marked scheduled");
3293 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3294 "all bundle members must be in same basic block");
3300 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
3304 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
3308 bool isPartOfBundle()
const {
3309 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3314 bool isReady()
const {
3315 assert(isSchedulingEntity() &&
3316 "can't consider non-scheduling entity for ready list");
3317 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3323 int incrementUnscheduledDeps(
int Incr) {
3324 assert(hasValidDependencies() &&
3325 "increment of unscheduled deps would be meaningless");
3326 UnscheduledDeps += Incr;
3327 return FirstInBundle->unscheduledDepsInBundle();
3332 void resetUnscheduledDeps() {
3333 UnscheduledDeps = Dependencies;
3337 void clearDependencies() {
3338 Dependencies = InvalidDeps;
3339 resetUnscheduledDeps();
3340 MemoryDependencies.clear();
3341 ControlDependencies.clear();
3344 int unscheduledDepsInBundle()
const {
3345 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3347 for (
const ScheduleData *BundleMember =
this; BundleMember;
3348 BundleMember = BundleMember->NextInBundle) {
3349 if (BundleMember->UnscheduledDeps == InvalidDeps)
3351 Sum += BundleMember->UnscheduledDeps;
3357 if (!isSchedulingEntity()) {
3358 os <<
"/ " << *Inst;
3359 }
else if (NextInBundle) {
3361 ScheduleData *SD = NextInBundle;
3363 os <<
';' << *SD->Inst;
3364 SD = SD->NextInBundle;
3375 Value *OpValue =
nullptr;
3378 TreeEntry *
TE =
nullptr;
3382 ScheduleData *FirstInBundle =
nullptr;
3386 ScheduleData *NextInBundle =
nullptr;
3390 ScheduleData *NextLoadStore =
nullptr;
3404 int SchedulingRegionID = 0;
3407 int SchedulingPriority = 0;
3413 int Dependencies = InvalidDeps;
3419 int UnscheduledDeps = InvalidDeps;
3423 bool IsScheduled =
false;
3428 const BoUpSLP::ScheduleData &SD) {
3453 struct BlockScheduling {
3455 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
3459 ScheduleStart =
nullptr;
3460 ScheduleEnd =
nullptr;
3461 FirstLoadStoreInRegion =
nullptr;
3462 LastLoadStoreInRegion =
nullptr;
3463 RegionHasStackSave =
false;
3467 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3470 ScheduleRegionSize = 0;
3474 ++SchedulingRegionID;
3478 if (BB !=
I->getParent())
3481 ScheduleData *SD = ScheduleDataMap.lookup(
I);
3482 if (SD && isInSchedulingRegion(SD))
3487 ScheduleData *getScheduleData(
Value *V) {
3488 if (
auto *
I = dyn_cast<Instruction>(V))
3489 return getScheduleData(
I);
3493 ScheduleData *getScheduleData(
Value *V,
Value *Key) {
3495 return getScheduleData(V);
3496 auto I = ExtraScheduleDataMap.find(V);
3497 if (
I != ExtraScheduleDataMap.end()) {
3498 ScheduleData *SD =
I->second.lookup(Key);
3499 if (SD && isInSchedulingRegion(SD))
3505 bool isInSchedulingRegion(ScheduleData *SD)
const {
3506 return SD->SchedulingRegionID == SchedulingRegionID;
3511 template <
typename ReadyListType>
3512 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3513 SD->IsScheduled =
true;
3516 for (ScheduleData *BundleMember = SD; BundleMember;
3517 BundleMember = BundleMember->NextInBundle) {
3518 if (BundleMember->Inst != BundleMember->OpValue)
3524 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
3525 doForAllOpcodes(
I, [&ReadyList](ScheduleData *OpDef) {
3526 if (OpDef && OpDef->hasValidDependencies() &&
3527 OpDef->incrementUnscheduledDeps(-1) == 0) {
3531 ScheduleData *DepBundle = OpDef->FirstInBundle;
3532 assert(!DepBundle->IsScheduled &&
3533 "already scheduled bundle gets ready");
3534 ReadyList.insert(DepBundle);
3536 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
3544 if (TreeEntry *TE = BundleMember->TE) {
3546 int Lane = std::distance(
TE->Scalars.begin(),
3547 find(
TE->Scalars, BundleMember->Inst));
3548 assert(Lane >= 0 &&
"Lane not set");
3556 auto *
In = BundleMember->Inst;
3559 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3560 In->getNumOperands() ==
TE->getNumOperands()) &&
3561 "Missed TreeEntry operands?");
3564 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
3565 OpIdx != NumOperands; ++OpIdx)
3566 if (
auto *
I = dyn_cast<Instruction>(
TE->getOperand(OpIdx)[Lane]))
3571 for (
Use &U : BundleMember->Inst->operands())
3572 if (
auto *
I = dyn_cast<Instruction>(
U.get()))
3576 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3577 if (MemoryDepSD->hasValidDependencies() &&
3578 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3581 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3582 assert(!DepBundle->IsScheduled &&
3583 "already scheduled bundle gets ready");
3584 ReadyList.insert(DepBundle);
3586 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
3590 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3591 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3594 ScheduleData *DepBundle = DepSD->FirstInBundle;
3595 assert(!DepBundle->IsScheduled &&
3596 "already scheduled bundle gets ready");
3597 ReadyList.insert(DepBundle);
3599 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
3610 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3611 ScheduleStart->comesBefore(ScheduleEnd) &&
3612 "Not a valid scheduling region?");
3614 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3615 auto *SD = getScheduleData(
I);
3618 assert(isInSchedulingRegion(SD) &&
3619 "primary schedule data not in window?");
3620 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3621 "entire bundle in window!");
3623 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->verify(); });
3626 for (
auto *SD : ReadyInsts) {
3627 assert(SD->isSchedulingEntity() && SD->isReady() &&
3628 "item in ready list not ready?");
3633 void doForAllOpcodes(
Value *V,
3635 if (ScheduleData *SD = getScheduleData(V))
3637 auto I = ExtraScheduleDataMap.find(V);
3638 if (
I != ExtraScheduleDataMap.end())
3639 for (
auto &
P :
I->second)
3640 if (isInSchedulingRegion(
P.second))
3645 template <
typename ReadyListType>
3646 void initialFillReadyList(ReadyListType &ReadyList) {
3647 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3648 doForAllOpcodes(
I, [&](ScheduleData *SD) {
3649 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3651 ReadyList.insert(SD);
3653 <<
"SLP: initially in ready list: " << *SD <<
"\n");
3668 std::optional<ScheduleData *>
3670 const InstructionsState &S);
3676 ScheduleData *allocateScheduleDataChunks();
3680 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
3685 ScheduleData *PrevLoadStore,
3686 ScheduleData *NextLoadStore);
3690 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
3694 void resetSchedule();
3715 ExtraScheduleDataMap;
3728 ScheduleData *FirstLoadStoreInRegion =
nullptr;
3732 ScheduleData *LastLoadStoreInRegion =
nullptr;
3737 bool RegionHasStackSave =
false;
3740 int ScheduleRegionSize = 0;
3749 int SchedulingRegionID = 1;
3757 void scheduleBlock(BlockScheduling *BS);
3764 struct OrdersTypeDenseMapInfo {
3777 static unsigned getHashValue(
const OrdersType &V) {
3798 unsigned MaxVecRegSize;
3799 unsigned MinVecRegSize;
3814 unsigned ReductionBitWidth = 0;
3818 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3837 struct ChildIteratorType
3839 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3850 return R.VectorizableTree[0].get();
3854 return {
N->UserTreeIndices.begin(),
N->Container};
3858 return {
N->UserTreeIndices.end(),
N->Container};
3863 class nodes_iterator {
3874 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
3878 return nodes_iterator(R->VectorizableTree.begin());
3882 return nodes_iterator(R->VectorizableTree.end());
3885 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
3896 OS << Entry->Idx <<
".\n";
3899 for (
auto *V : Entry->Scalars) {
3901 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
3902 return EU.Scalar == V;
3912 if (Entry->State == TreeEntry::NeedToGather)
3914 if (Entry->State == TreeEntry::ScatterVectorize ||
3915 Entry->State == TreeEntry::StridedVectorize)
3916 return "color=blue";
3925 for (
auto *
I : DeletedInstructions) {
3926 for (
Use &U :
I->operands()) {
3927 auto *
Op = dyn_cast<Instruction>(U.get());
3928 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
3932 I->dropAllReferences();
3934 for (
auto *
I : DeletedInstructions) {
3936 "trying to erase instruction with users.");
3937 I->eraseFromParent();
3943#ifdef EXPENSIVE_CHECKS
3954 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
3955 "Expected non-empty mask.");
3958 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
3960 Reuses[Mask[
I]] = Prev[
I];
3968 bool BottomOrder =
false) {
3969 assert(!Mask.empty() &&
"Expected non-empty mask.");
3970 unsigned Sz = Mask.size();
3973 if (Order.
empty()) {
3975 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
3977 PrevOrder.
swap(Order);
3980 for (
unsigned I = 0;
I < Sz; ++
I)
3982 Order[
I] = PrevOrder[Mask[
I]];
3984 return Data.value() == Sz ||
Data.index() ==
Data.value();
3993 if (Order.
empty()) {
3995 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
4005 for (
unsigned I = 0;
I < Sz; ++
I)
4007 Order[MaskOrder[
I]] =
I;
4011std::optional<BoUpSLP::OrdersType>
4013 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4017 Type *ScalarTy = GatheredScalars.
front()->getType();
4018 int NumScalars = GatheredScalars.
size();
4020 return std::nullopt;
4023 if (NumParts == 0 || NumParts >= NumScalars)
4029 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4031 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4034 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4035 return std::nullopt;
4036 OrdersType CurrentOrder(NumScalars, NumScalars);
4037 if (GatherShuffles.
size() == 1 &&
4039 Entries.front().front()->isSame(TE.Scalars)) {
4042 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4043 return CurrentOrder;
4047 return all_of(Mask, [&](
int I) {
4054 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4055 (Entries.size() != 1 ||
4056 Entries.front().front()->ReorderIndices.empty())) ||
4057 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4058 return std::nullopt;
4063 for (
int I : seq<int>(0, NumParts)) {
4064 if (ShuffledSubMasks.
test(
I))
4066 const int VF = GetVF(
I);
4071 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
4072 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4073 ShuffledSubMasks.
set(
I);
4077 int FirstMin = INT_MAX;
4078 int SecondVecFound =
false;
4079 for (
int K : seq<int>(0, PartSz)) {
4080 int Idx = Mask[
I * PartSz + K];
4082 Value *V = GatheredScalars[
I * PartSz + K];
4084 SecondVecFound =
true;
4093 SecondVecFound =
true;
4097 FirstMin = (FirstMin / PartSz) * PartSz;
4099 if (SecondVecFound) {
4100 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4101 ShuffledSubMasks.
set(
I);
4104 for (
int K : seq<int>(0, PartSz)) {
4105 int Idx = Mask[
I * PartSz + K];
4109 if (
Idx >= PartSz) {
4110 SecondVecFound =
true;
4113 if (CurrentOrder[
I * PartSz +
Idx] >
4114 static_cast<unsigned>(
I * PartSz + K) &&
4115 CurrentOrder[
I * PartSz +
Idx] !=
4116 static_cast<unsigned>(
I * PartSz +
Idx))
4117 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4120 if (SecondVecFound) {
4121 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4122 ShuffledSubMasks.
set(
I);
4127 int PartSz = NumScalars / NumParts;
4128 if (!ExtractShuffles.
empty())
4129 TransformMaskToOrder(
4130 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
4131 if (!ExtractShuffles[
I])
4134 for (
unsigned Idx : seq<unsigned>(0, PartSz)) {
4135 int K =
I * PartSz +
Idx;
4138 if (!TE.ReuseShuffleIndices.empty())
4139 K = TE.ReuseShuffleIndices[K];
4140 if (!TE.ReorderIndices.empty())
4141 K = std::distance(TE.ReorderIndices.begin(),
4142 find(TE.ReorderIndices, K));
4143 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4146 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4148 .getKnownMinValue());
4153 if (GatherShuffles.
size() == 1 && NumParts != 1) {
4154 if (ShuffledSubMasks.
any())
4155 return std::nullopt;
4156 PartSz = NumScalars;
4159 if (!Entries.empty())
4160 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
4161 if (!GatherShuffles[
I])
4163 return std::max(Entries[
I].front()->getVectorFactor(),
4164 Entries[
I].back()->getVectorFactor());
4167 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
4168 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4169 return std::nullopt;
4170 return std::move(CurrentOrder);
4175 bool CompareOpcodes =
true) {
4178 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4181 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4184 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4188 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4193template <
typename T>
4195 Align CommonAlignment = cast<T>(VL.
front())->getAlign();
4197 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->
getAlign());
4198 return CommonAlignment;
4203 unsigned Sz = Order.
size();
4205 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4216static std::optional<Value *>
4222 const SCEV *PtrSCEVLowest =
nullptr;
4223 const SCEV *PtrSCEVHighest =
nullptr;
4229 return std::nullopt;
4231 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4232 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4236 if (isa<SCEVCouldNotCompute>(Diff))
4237 return std::nullopt;
4239 PtrSCEVLowest = PtrSCEV;
4243 if (isa<SCEVCouldNotCompute>(Diff1))
4244 return std::nullopt;
4246 PtrSCEVHighest = PtrSCEV;
4252 if (isa<SCEVCouldNotCompute>(Dist))
4253 return std::nullopt;
4254 int Size =
DL.getTypeStoreSize(ElemTy);
4255 auto TryGetStride = [&](
const SCEV *Dist,
4256 const SCEV *Multiplier) ->
const SCEV * {
4257 if (
const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4258 if (M->getOperand(0) == Multiplier)
4259 return M->getOperand(1);
4260 if (M->getOperand(1) == Multiplier)
4261 return M->getOperand(0);
4264 if (Multiplier == Dist)
4269 const SCEV *Stride =
nullptr;
4270 if (
Size != 1 || SCEVs.
size() > 2) {
4272 Stride = TryGetStride(Dist, Sz);
4274 return std::nullopt;
4276 if (!Stride || isa<SCEVConstant>(Stride))
4277 return std::nullopt;
4280 using DistOrdPair = std::pair<int64_t, int>;
4282 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4284 bool IsConsecutive =
true;
4285 for (
const SCEV *PtrSCEV : SCEVs) {
4287 if (PtrSCEV != PtrSCEVLowest) {
4289 const SCEV *Coeff = TryGetStride(Diff, Stride);
4291 return std::nullopt;
4292 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4293 if (!SC || isa<SCEVCouldNotCompute>(SC))
4294 return std::nullopt;
4298 return std::nullopt;
4299 Dist = SC->getAPInt().getZExtValue();
4303 return std::nullopt;
4304 auto Res = Offsets.emplace(Dist, Cnt);
4306 return std::nullopt;
4308 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4311 if (Offsets.size() != SCEVs.
size())
4312 return std::nullopt;
4313 SortedIndices.
clear();
4314 if (!IsConsecutive) {
4318 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4319 SortedIndices[Cnt] = Pair.second;
4329static std::pair<InstructionCost, InstructionCost>
4345 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
4351 const unsigned Sz = VL.
size();
4353 auto *POIter = PointerOps.
begin();
4354 for (
Value *V : VL) {
4355 auto *L = cast<LoadInst>(V);
4358 *POIter = L->getPointerOperand();
4369 "supported with VectorizeNonPowerOf2");
4373 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4384 if (Order.
empty()) {
4385 Ptr0 = PointerOps.
front();
4386 PtrN = PointerOps.
back();
4388 Ptr0 = PointerOps[Order.
front()];
4389 PtrN = PointerOps[Order.
back()];
4391 std::optional<int> Diff =
4394 if (
static_cast<unsigned>(*Diff) == Sz - 1)
4397 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4409 (
static_cast<unsigned>(std::abs(*Diff)) <=
4412 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4413 *Diff == -(
static_cast<int>(Sz) - 1))) {
4414 int Stride = *Diff /
static_cast<int>(Sz - 1);
4415 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
4427 else if (
Ptr != Ptr0)
4432 if (((Dist / Stride) * Stride) != Dist ||
4433 !Dists.
insert(Dist).second)
4436 if (Dists.
size() == Sz)
4442 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment) {
4443 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
4445 unsigned MaxVF = std::max<unsigned>(
bit_floor(VL.
size() / 2), MinVF);
4446 MaxVF = std::min(
getMaximumVF(Sz, Instruction::Load), MaxVF);
4447 for (
unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4448 unsigned VectorizedCnt = 0;
4450 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End;
4451 Cnt += VF, ++VectorizedCnt) {
4469 if (VectorizedCnt == VL.
size() / VF) {
4472 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4473 TTI, PointerOps, PointerOps.
front(), Instruction::GetElementPtr,
4477 Instruction::Load, VecTy,
4479 false, CommonAlignment,
CostKind) +
4480 VectorGEPCost - ScalarGEPCost;
4484 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
4487 auto [ScalarGEPCost, VectorGEPCost] =
4489 LI0->getPointerOperand(), Instruction::Load,
4492 Instruction::Load, SubVecTy, LI0->getAlign(),
4493 LI0->getPointerAddressSpace(),
CostKind,
4495 VectorGEPCost - ScalarGEPCost;
4499 auto [ScalarGEPCost, VectorGEPCost] =
4501 LI0->getPointerOperand(), Instruction::Load,
4505 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4506 false, CommonAlignment,
CostKind) +
4507 VectorGEPCost - ScalarGEPCost;
4511 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4513 LI0->getPointerOperand(), Instruction::GetElementPtr,
4517 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4518 false, CommonAlignment,
CostKind) +
4519 VectorGEPCost - ScalarGEPCost;
4524 "Expected only consecutive, strided or masked gather loads.");
4527 for (
int Idx : seq<int>(0, VL.
size()))
4536 if (MaskedGatherCost >= VecLdCost)
4546 bool ProfitableGatherPointers =
4549 return L->isLoopInvariant(V);
4551 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
4552 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
4554 (
GEP &&
GEP->getNumOperands() == 2 &&
4555 isa<Constant, Instruction>(
GEP->getOperand(1)));
4557 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4562 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4581 "Expected list of pointer operands.");
4586 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4591 std::optional<int> Diff =
4597 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4603 if (Bases.
size() > VL.
size() / 2 - 1)
4607 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4613 bool AnyConsecutive =
false;
4614 for (
auto &
Base : Bases) {
4615 auto &Vec =
Base.second;
4616 if (Vec.size() > 1) {
4618 const std::tuple<Value *, int, unsigned> &
Y) {
4619 return std::get<1>(
X) < std::get<1>(
Y);
4621 int InitialOffset = std::get<1>(Vec[0]);
4623 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4629 SortedIndices.
clear();
4630 if (!AnyConsecutive)
4633 for (
auto &
Base : Bases) {
4634 for (
auto &
T :
Base.second)
4639 "Expected SortedIndices to be the size of VL");
4643std::optional<BoUpSLP::OrdersType>
4645 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4646 Type *ScalarTy = TE.Scalars[0]->getType();
4649 Ptrs.
reserve(TE.Scalars.size());
4650 for (
Value *V : TE.Scalars) {
4651 auto *L = dyn_cast<LoadInst>(V);
4652 if (!L || !L->isSimple())
4653 return std::nullopt;
4659 return std::move(Order);
4660 return std::nullopt;
4671 if (VU->
getType() != V->getType())
4674 if (!VU->
hasOneUse() && !V->hasOneUse())
4680 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4686 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
4687 bool IsReusedIdx =
false;
4689 if (IE2 == VU && !IE1)
4691 if (IE1 == V && !IE2)
4692 return V->hasOneUse();
4693 if (IE1 && IE1 != V) {
4695 IsReusedIdx |= ReusedIdx.
test(Idx1);
4696 ReusedIdx.
set(Idx1);
4697 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
4700 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4702 if (IE2 && IE2 != VU) {
4704 IsReusedIdx |= ReusedIdx.
test(Idx2);
4705 ReusedIdx.
set(Idx2);
4706 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4709 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4711 }
while (!IsReusedIdx && (IE1 || IE2));
4715std::optional<BoUpSLP::OrdersType>
4718 if (TE.isNonPowOf2Vec())
4719 return std::nullopt;
4723 if (!TE.ReuseShuffleIndices.empty()) {
4725 return std::nullopt;
4733 unsigned Sz = TE.Scalars.size();
4734 if (TE.State == TreeEntry::NeedToGather) {
4735 if (std::optional<OrdersType> CurrentOrder =
4740 ::addMask(Mask, TE.ReuseShuffleIndices);
4741 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4742 unsigned Sz = TE.Scalars.size();
4743 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
4746 Res[
Idx + K * Sz] =
I + K * Sz;
4748 return std::move(Res);
4751 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4753 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4754 return std::nullopt;
4758 if (TE.ReorderIndices.empty())
4759 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4762 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4763 unsigned VF = ReorderMask.
size();
4765 unsigned NumParts = VF / Sz;
4767 for (
unsigned I = 0;
I < VF;
I += Sz) {
4769 unsigned UndefCnt = 0;
4778 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
4780 return std::nullopt;
4782 for (
unsigned K = 0; K < NumParts; ++K)
4783 ResOrder[Val + Sz * K] =
I + K;
4785 return std::move(ResOrder);
4787 unsigned VF = TE.getVectorFactor();
4790 TE.ReuseShuffleIndices.end());
4791 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4793 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4794 return Idx && *Idx < Sz;
4797 if (TE.ReorderIndices.empty())
4798 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4801 for (
unsigned I = 0;
I < VF; ++
I) {
4802 int &
Idx = ReusedMask[
I];
4805 Value *V = TE.Scalars[ReorderMask[
Idx]];
4807 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
4813 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
4814 auto *It = ResOrder.
begin();
4815 for (
unsigned K = 0; K < VF; K += Sz) {
4819 std::iota(SubMask.begin(), SubMask.end(), 0);
4821 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
4822 std::advance(It, Sz);
4824 if (TE.State == TreeEntry::NeedToGather &&
4826 [](
const auto &
Data) {
return Data.index() ==
Data.value(); }))
4827 return std::nullopt;
4828 return std::move(ResOrder);
4830 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4831 any_of(TE.UserTreeIndices,
4833 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4835 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
4836 return std::nullopt;
4837 if ((TE.State == TreeEntry::Vectorize ||
4838 TE.State == TreeEntry::StridedVectorize) &&
4839 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4840 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4842 return TE.ReorderIndices;
4843 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4844 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
4845 Value *V1 = TE.Scalars[I1];
4846 Value *V2 = TE.Scalars[I2];
4847 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
4853 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
4854 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4855 if (
auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4856 if (
auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4863 if (
auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4864 if (
auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4865 if (EE1->getOperand(0) != EE2->getOperand(0))
4871 auto IsIdentityOrder = [](
const OrdersType &Order) {
4872 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
4877 if (!TE.ReorderIndices.empty())
4878 return TE.ReorderIndices;
4881 std::iota(Phis.begin(), Phis.end(), 0);
4883 for (
unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4886 for (
unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4887 ResOrder[Id] = PhiToId[Phis[Id]];
4888 if (IsIdentityOrder(ResOrder))
4889 return std::nullopt;
4890 return std::move(ResOrder);
4892 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4896 if ((TE.getOpcode() == Instruction::ExtractElement ||
4897 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4898 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4900 auto *EE = dyn_cast<ExtractElementInst>(V);
4901 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4906 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4908 if (Reuse || !CurrentOrder.
empty())
4909 return std::move(CurrentOrder);
4917 int Sz = TE.Scalars.size();
4919 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4921 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
4922 if (It == TE.Scalars.begin())
4925 if (It != TE.Scalars.end()) {
4927 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4942 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4945 return std::move(Order);
4950 return std::nullopt;
4951 if (TE.Scalars.size() >= 4)
4955 return CurrentOrder;
4957 return std::nullopt;
4967 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
4969 if (Cluster != FirstCluster)
4975void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
4978 const unsigned Sz =
TE.Scalars.size();
4980 if (
TE.State != TreeEntry::NeedToGather ||
4987 addMask(NewMask,
TE.ReuseShuffleIndices);
4989 TE.ReorderIndices.clear();
4996 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
4997 *
End =
TE.ReuseShuffleIndices.end();
4998 It !=
End; std::advance(It, Sz))
4999 std::iota(It, std::next(It, Sz), 0);
5005 "Expected same size of orders");
5006 unsigned Sz = Order.
size();
5008 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5009 if (Order[
Idx] != Sz)
5010 UsedIndices.
set(Order[
Idx]);
5012 if (SecondaryOrder.
empty()) {
5013 for (
unsigned Idx : seq<unsigned>(0, Sz))
5014 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5017 for (
unsigned Idx : seq<unsigned>(0, Sz))
5018 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5019 !UsedIndices.
test(SecondaryOrder[
Idx]))
5020 Order[
Idx] = SecondaryOrder[
Idx];
5040 ExternalUserReorderMap;
5045 const std::unique_ptr<TreeEntry> &TE) {
5048 findExternalStoreUsersReorderIndices(TE.get());
5049 if (!ExternalUserReorderIndices.
empty()) {
5050 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5052 std::move(ExternalUserReorderIndices));
5058 if (TE->isAltShuffle()) {
5061 unsigned Opcode0 = TE->getOpcode();
5062 unsigned Opcode1 = TE->getAltOpcode();
5065 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
5066 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
5067 OpcodeMask.
set(Lane);
5069 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5070 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5076 if (std::optional<OrdersType> CurrentOrder =
5086 const TreeEntry *UserTE = TE.get();
5088 if (UserTE->UserTreeIndices.size() != 1)
5091 return EI.UserTE->State == TreeEntry::Vectorize &&
5092 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5095 UserTE = UserTE->UserTreeIndices.back().UserTE;
5098 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5099 if (!(TE->State == TreeEntry::Vectorize ||
5100 TE->State == TreeEntry::StridedVectorize) ||
5101 !TE->ReuseShuffleIndices.empty())
5102 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
5103 if (TE->State == TreeEntry::Vectorize &&
5104 TE->getOpcode() == Instruction::PHI)
5105 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
5110 for (
unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5112 auto It = VFToOrderedEntries.
find(VF);
5113 if (It == VFToOrderedEntries.
end())
5125 for (
const TreeEntry *OpTE : OrderedEntries) {
5128 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5131 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5133 if (OpTE->State == TreeEntry::NeedToGather ||
5134 !OpTE->ReuseShuffleIndices.empty()) {
5135 auto It = GathersToOrders.find(OpTE);
5136 if (It != GathersToOrders.end())
5139 if (OpTE->isAltShuffle()) {
5140 auto It = AltShufflesToOrders.find(OpTE);
5141 if (It != AltShufflesToOrders.end())
5144 if (OpTE->State == TreeEntry::Vectorize &&
5145 OpTE->getOpcode() == Instruction::PHI) {
5146 auto It = PhisToOrders.
find(OpTE);
5147 if (It != PhisToOrders.
end())
5150 return OpTE->ReorderIndices;
5153 auto It = ExternalUserReorderMap.
find(OpTE);
5154 if (It != ExternalUserReorderMap.
end()) {
5155 const auto &ExternalUserReorderIndices = It->second;
5159 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5160 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
5161 ExternalUserReorderIndices.size();
5163 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
5164 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5171 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5172 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5175 unsigned E = Order.size();
5178 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5181 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5183 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5186 if (OrdersUses.empty())
5189 const unsigned Sz = Order.size();
5190 for (
unsigned Idx : seq<unsigned>(0, Sz))
5191 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5196 unsigned IdentityCnt = 0;
5197 unsigned FilledIdentityCnt = 0;
5199 for (
auto &Pair : OrdersUses) {
5200 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5201 if (!Pair.first.empty())
5202 FilledIdentityCnt += Pair.second;
5203 IdentityCnt += Pair.second;
5208 unsigned Cnt = IdentityCnt;
5209 for (
auto &Pair : OrdersUses) {
5213 if (Cnt < Pair.second ||
5214 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5215 Cnt == Pair.second && !BestOrder.
empty() &&
5216 IsIdentityOrder(BestOrder))) {
5218 BestOrder = Pair.first;
5225 if (IsIdentityOrder(BestOrder))
5231 unsigned E = BestOrder.
size();
5233 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5236 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5238 if (TE->Scalars.size() != VF) {
5239 if (TE->ReuseShuffleIndices.size() == VF) {
5245 return EI.UserTE->Scalars.size() == VF ||
5246 EI.UserTE->Scalars.size() ==
5249 "All users must be of VF size.");
5252 reorderNodeWithReuses(*TE, Mask);
5256 if ((TE->State == TreeEntry::Vectorize ||
5257 TE->State == TreeEntry::StridedVectorize) &&
5260 !TE->isAltShuffle()) {
5264 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5265 TE->reorderOperands(Mask);
5268 TE->reorderOperands(Mask);
5269 assert(TE->ReorderIndices.empty() &&
5270 "Expected empty reorder sequence.");
5273 if (!TE->ReuseShuffleIndices.empty()) {
5280 addMask(NewReuses, TE->ReuseShuffleIndices);
5281 TE->ReuseShuffleIndices.swap(NewReuses);
5287bool BoUpSLP::canReorderOperands(
5288 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5292 if (UserTE->isNonPowOf2Vec())
5295 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
5296 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
5297 return OpData.first ==
I &&
5298 (OpData.second->State == TreeEntry::Vectorize ||
5299 OpData.second->State == TreeEntry::StridedVectorize);
5302 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
5304 if (
any_of(TE->UserTreeIndices,
5305 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5309 Edges.emplace_back(
I, TE);
5315 if (TE->State != TreeEntry::Vectorize &&
5316 TE->State != TreeEntry::StridedVectorize &&
5317 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5321 TreeEntry *
Gather =
nullptr;
5323 [&
Gather, UserTE,
I](TreeEntry *TE) {
5324 assert(TE->State != TreeEntry::Vectorize &&
5325 TE->State != TreeEntry::StridedVectorize &&
5326 "Only non-vectorized nodes are expected.");
5327 if (
any_of(TE->UserTreeIndices,
5328 [UserTE,
I](
const EdgeInfo &EI) {
5329 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5331 assert(TE->isSame(UserTE->getOperand(
I)) &&
5332 "Operand entry does not match operands.");
5353 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5354 if (TE->State != TreeEntry::Vectorize &&
5355 TE->State != TreeEntry::StridedVectorize)
5357 if (std::optional<OrdersType> CurrentOrder =
5359 OrderedEntries.
insert(TE.get());
5360 if (!(TE->State == TreeEntry::Vectorize ||
5361 TE->State == TreeEntry::StridedVectorize) ||
5362 !TE->ReuseShuffleIndices.empty())
5363 GathersToOrders.
insert(TE.get());
5372 while (!OrderedEntries.
empty()) {
5377 for (TreeEntry *TE : OrderedEntries) {
5378 if (!(TE->State == TreeEntry::Vectorize ||
5379 TE->State == TreeEntry::StridedVectorize ||
5380 (TE->State == TreeEntry::NeedToGather &&
5382 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5385 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5387 !Visited.
insert(TE).second) {
5393 for (
EdgeInfo &EI : TE->UserTreeIndices) {
5394 TreeEntry *UserTE = EI.
UserTE;
5395 auto It =
Users.find(UserTE);
5396 if (It ==
Users.end())
5397 It =
Users.insert({UserTE, {}}).first;
5398 It->second.emplace_back(EI.
EdgeIdx, TE);
5402 for (TreeEntry *TE : Filtered)
5403 OrderedEntries.remove(TE);
5405 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5407 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
5408 return Data1.first->Idx > Data2.first->Idx;
5410 for (
auto &
Data : UsersVec) {
5413 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
5415 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5416 OrderedEntries.remove(
Op.second);
5429 for (
const auto &
Op :
Data.second) {
5430 TreeEntry *OpTE =
Op.second;
5431 if (!VisitedOps.
insert(OpTE).second)
5433 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5435 const auto Order = [&]() ->
const OrdersType {
5436 if (OpTE->State == TreeEntry::NeedToGather ||
5437 !OpTE->ReuseShuffleIndices.empty())
5440 return OpTE->ReorderIndices;
5444 if (Order.size() == 1)
5447 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
5448 return P.second == OpTE;
5451 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5452 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5455 unsigned E = Order.size();
5458 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5461 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5464 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5466 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
5467 const auto AllowsReordering = [&](
const TreeEntry *TE) {
5469 if (TE->isNonPowOf2Vec())
5471 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5472 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5473 (IgnoreReorder && TE->Idx == 0))
5475 if (TE->State == TreeEntry::NeedToGather) {
5484 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
5485 TreeEntry *UserTE = EI.
UserTE;
5486 if (!VisitedUsers.
insert(UserTE).second)
5491 if (AllowsReordering(UserTE))
5499 if (
static_cast<unsigned>(
count_if(
5500 Ops, [UserTE, &AllowsReordering](
5501 const std::pair<unsigned, TreeEntry *> &
Op) {
5502 return AllowsReordering(
Op.second) &&
5505 return EI.UserTE == UserTE;
5507 })) <= Ops.
size() / 2)
5508 ++Res.first->second;
5511 if (OrdersUses.empty()) {
5512 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5513 OrderedEntries.remove(
Op.second);
5517 const unsigned Sz = Order.size();
5518 for (
unsigned Idx : seq<unsigned>(0, Sz))
5519 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5524 unsigned IdentityCnt = 0;
5525 unsigned VF =
Data.second.front().second->getVectorFactor();
5527 for (
auto &Pair : OrdersUses) {
5528 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5529 IdentityCnt += Pair.second;
5534 unsigned Cnt = IdentityCnt;
5535 for (
auto &Pair : OrdersUses) {
5539 if (Cnt < Pair.second) {
5541 BestOrder = Pair.first;
5548 if (IsIdentityOrder(BestOrder)) {
5549 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5550 OrderedEntries.remove(
Op.second);
5559 unsigned E = BestOrder.
size();
5561 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5563 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
5564 TreeEntry *TE =
Op.second;
5565 OrderedEntries.remove(TE);
5566 if (!VisitedOps.
insert(TE).second)
5568 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
5569 reorderNodeWithReuses(*TE, Mask);
5573 if (TE->State != TreeEntry::Vectorize &&
5574 TE->State != TreeEntry::StridedVectorize &&
5575 (TE->State != TreeEntry::ScatterVectorize ||
5576 TE->ReorderIndices.empty()))
5578 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
5579 TE->ReorderIndices.empty()) &&
5580 "Non-matching sizes of user/operand entries.");
5582 if (IgnoreReorder && TE == VectorizableTree.front().get())
5583 IgnoreReorder =
false;
5586 for (TreeEntry *
Gather : GatherOps) {
5588 "Unexpected reordering of gathers.");
5589 if (!
Gather->ReuseShuffleIndices.empty()) {
5595 OrderedEntries.remove(
Gather);
5599 if (
Data.first->State != TreeEntry::Vectorize ||
5600 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5601 Data.first->getMainOp()) ||
5602 Data.first->isAltShuffle())
5603 Data.first->reorderOperands(Mask);
5604 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
5605 Data.first->isAltShuffle() ||
5606 Data.first->State == TreeEntry::StridedVectorize) {
5610 if (
Data.first->ReuseShuffleIndices.empty() &&
5611 !
Data.first->ReorderIndices.empty() &&
5612 !
Data.first->isAltShuffle()) {
5615 OrderedEntries.insert(
Data.first);
5623 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5624 VectorizableTree.front()->ReuseShuffleIndices.empty())
5625 VectorizableTree.front()->ReorderIndices.clear();
5632 for (
auto &TEPtr : VectorizableTree) {
5633 TreeEntry *Entry = TEPtr.get();
5636 if (Entry->State == TreeEntry::NeedToGather)
5640 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5641 Value *Scalar = Entry->Scalars[Lane];
5642 if (!isa<Instruction>(Scalar))
5645 auto It = ScalarToExtUses.
find(Scalar);
5646 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
5650 const auto *ExtI = ExternallyUsedValues.
find(Scalar);
5651 if (ExtI != ExternallyUsedValues.
end()) {
5652 int FoundLane = Entry->findLaneForValue(Scalar);
5653 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
5654 << FoundLane <<
" from " << *Scalar <<
".\n");
5655 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
5656 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
5659 for (
User *U : Scalar->users()) {
5667 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5671 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5675 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5677 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5678 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
5680 assert(UseEntry->State != TreeEntry::NeedToGather &&
"Bad state");
5684 if (It != ScalarToExtUses.
end()) {
5685 ExternalUses[It->second].User =
nullptr;
5690 int FoundLane = Entry->findLaneForValue(Scalar);
5692 <<
" from lane " << FoundLane <<
" from " << *Scalar
5694 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
5695 ExternalUses.emplace_back(Scalar, U, FoundLane);
5704BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
5706 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5707 Value *V = TE->Scalars[Lane];
5713 for (
User *U : V->users()) {
5714 auto *SI = dyn_cast<StoreInst>(U);
5715 if (SI ==
nullptr || !SI->isSimple() ||
5719 if (getTreeEntry(U))
5723 auto &StoresVec = PtrToStoresMap[
Ptr];
5726 if (StoresVec.size() > Lane)
5729 if (!StoresVec.empty() &&
5730 SI->getParent() != StoresVec.back()->getParent())
5733 if (!StoresVec.empty() &&
5734 SI->getValueOperand()->getType() !=
5735 StoresVec.back()->getValueOperand()->getType())
5737 StoresVec.push_back(SI);
5740 return PtrToStoresMap;
5744 OrdersType &ReorderIndices)
const {
5752 StoreOffsetVec[0] = {S0, 0};
5755 for (
unsigned Idx : seq<unsigned>(1, StoresVec.
size())) {
5757 std::optional<int> Diff =
5759 SI->getPointerOperand(), *
DL, *SE,
5764 StoreOffsetVec[
Idx] = {StoresVec[
Idx], *Diff};
5769 stable_sort(StoreOffsetVec, [](
const std::pair<StoreInst *, int> &Pair1,
5770 const std::pair<StoreInst *, int> &Pair2) {
5771 int Offset1 = Pair1.second;
5772 int Offset2 = Pair2.second;
5773 return Offset1 < Offset2;
5777 for (
unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5778 if (StoreOffsetVec[
Idx].second != StoreOffsetVec[
Idx - 1].second + 1)
5783 ReorderIndices.reserve(StoresVec.
size());
5786 [SI](
const std::pair<StoreInst *, int> &Pair) {
5787 return Pair.first ==
SI;
5789 StoreOffsetVec.begin();
5790 ReorderIndices.push_back(
Idx);
5795 auto IsIdentityOrder = [](
const OrdersType &Order) {
5796 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
5801 if (IsIdentityOrder(ReorderIndices))
5802 ReorderIndices.clear();
5809 for (
unsigned Idx : Order)
5816BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
5817 unsigned NumLanes =
TE->Scalars.size();
5820 collectUserStores(TE);
5829 for (
const auto &Pair : PtrToStoresMap) {
5830 auto &StoresVec = Pair.second;
5832 if (StoresVec.size() != NumLanes)
5837 if (!canFormVector(StoresVec, ReorderIndices))
5842 ExternalReorderIndices.
push_back(ReorderIndices);
5844 return ExternalReorderIndices;
5850 UserIgnoreList = &UserIgnoreLst;
5853 buildTree_rec(Roots, 0,
EdgeInfo());
5860 buildTree_rec(Roots, 0,
EdgeInfo());
5867 Value *NeedsScheduling =
nullptr;
5868 for (
Value *V : VL) {
5871 if (!NeedsScheduling) {
5872 NeedsScheduling = V;
5877 return NeedsScheduling;
5888 bool AllowAlternate) {
5892 if (
auto *LI = dyn_cast<LoadInst>(V)) {
5895 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
5900 if (isa<ExtractElementInst, UndefValue>(V))
5902 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
5904 !isa<UndefValue>(EI->getIndexOperand()))
5907 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
5910 if ((isa<BinaryOperator, CastInst>(
I)) &&
5920 : cast<CastInst>(
I)->getOperand(0)->getType()));
5922 if (isa<CastInst>(
I)) {
5923 std::pair<size_t, size_t> OpVals =
5929 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
5931 if (CI->isCommutative())
5937 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
5951 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
5952 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5953 SubKey =
hash_value(Gep->getPointerOperand());
5957 !isa<ConstantInt>(
I->getOperand(1))) {
5965 return std::make_pair(Key, SubKey);
5975bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
5977 unsigned Opcode0 = S.getOpcode();
5978 unsigned Opcode1 = S.getAltOpcode();
5981 for (
unsigned Lane : seq<unsigned>(0, VL.
size()))
5982 if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
5983 OpcodeMask.set(Lane);
5986 Opcode0, Opcode1, OpcodeMask))
5989 for (
unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5993 Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
5997 for (
unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6003 switch (Res.value_or(0)) {
6018 constexpr unsigned NumAltInsts = 3;
6019 unsigned NonInstCnt = 0;
6022 unsigned UndefCnt = 0;
6024 unsigned ExtraShuffleInsts = 0;
6033 return is_contained(Operands.back(), V);
6036 ++ExtraShuffleInsts;
6053 if (isa<Constant, ExtractElementInst>(V) ||
6054 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
6055 if (isa<UndefValue>(V))
6061 if (!Res.second && Res.first->second == 1)
6062 ++ExtraShuffleInsts;
6063 ++Res.first->getSecond();
6064 if (
auto *
I = dyn_cast<Instruction>(V))
6065 UniqueOpcodes.
insert(
I->getOpcode());
6066 else if (Res.second)
6069 return none_of(Uniques, [&](
const auto &
P) {
6070 return P.first->hasNUsesOrMore(
P.second + 1) &&
6072 return getTreeEntry(U) || Uniques.contains(U);
6081 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6082 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
6083 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6086BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6089 assert(S.MainOp &&
"Expected instructions with same/alternate opcodes only.");
6091 unsigned ShuffleOrOp =
6092 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
6093 auto *VL0 = cast<Instruction>(S.OpValue);
6094 switch (ShuffleOrOp) {
6095 case Instruction::PHI: {
6098 return TreeEntry::NeedToGather;
6101 for (
Value *
Incoming : cast<PHINode>(V)->incoming_values()) {
6103 if (Term &&
Term->isTerminator()) {
6105 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
6106 return TreeEntry::NeedToGather;
6110 return TreeEntry::Vectorize;
6112 case Instruction::ExtractValue:
6113 case Instruction::ExtractElement: {
6114 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6117 return TreeEntry::NeedToGather;
6118 if (Reuse || !CurrentOrder.empty())
6119 return TreeEntry::Vectorize;
6121 return TreeEntry::NeedToGather;
6123 case Instruction::InsertElement: {
6127 for (
Value *V : VL) {
6128 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
6130 "Non-constant or undef index?");
6134 return !SourceVectors.contains(V);
6137 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
6138 "different source vectors.\n");
6139 return TreeEntry::NeedToGather;
6142 return TreeEntry::Vectorize;
6144 case Instruction::Load: {
6153 return TreeEntry::Vectorize;
6155 return TreeEntry::ScatterVectorize;
6157 return TreeEntry::StridedVectorize;
6160 Type *ScalarTy = VL0->getType();
6161 if (
DL->getTypeSizeInBits(ScalarTy) !=
6162 DL->getTypeAllocSizeInBits(ScalarTy))
6163 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
6165 [](
Value *V) {
return !cast<LoadInst>(V)->isSimple(); }))
6170 return TreeEntry::NeedToGather;
6174 case Instruction::ZExt:
6175 case Instruction::SExt:
6176 case Instruction::FPToUI:
6177 case Instruction::FPToSI:
6178 case Instruction::FPExt:
6179 case Instruction::PtrToInt:
6180 case Instruction::IntToPtr:
6181 case Instruction::SIToFP:
6182 case Instruction::UIToFP:
6183 case Instruction::Trunc:
6184 case Instruction::FPTrunc:
6185 case Instruction::BitCast: {
6186 Type *SrcTy = VL0->getOperand(0)->getType();
6187 for (
Value *V : VL) {
6188 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6191 dbgs() <<
"SLP: Gathering casts with different src types.\n");
6192 return TreeEntry::NeedToGather;
6195 return TreeEntry::Vectorize;
6197 case Instruction::ICmp:
6198 case Instruction::FCmp: {
6202 Type *ComparedTy = VL0->getOperand(0)->getType();
6203 for (
Value *V : VL) {
6205 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
6206 Cmp->getOperand(0)->getType() != ComparedTy) {
6207 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
6208 return TreeEntry::NeedToGather;
6211 return TreeEntry::Vectorize;
6213 case Instruction::Select:
6214 case Instruction::FNeg:
6215 case Instruction::Add:
6216 case Instruction::FAdd:
6217 case Instruction::Sub:
6218 case Instruction::FSub:
6219 case Instruction::Mul:
6220 case Instruction::FMul:
6221 case Instruction::UDiv:
6222 case Instruction::SDiv:
6223 case Instruction::FDiv:
6224 case Instruction::URem:
6225 case Instruction::SRem:
6226 case Instruction::FRem:
6227 case Instruction::Shl:
6228 case Instruction::LShr:
6229 case Instruction::AShr:
6230 case Instruction::And:
6231 case Instruction::Or:
6232 case Instruction::Xor:
6233 return TreeEntry::Vectorize;
6234 case Instruction::GetElementPtr: {
6236 for (
Value *V : VL) {
6237 auto *
I = dyn_cast<GetElementPtrInst>(V);
6240 if (
I->getNumOperands() != 2) {
6241 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
6242 return TreeEntry::NeedToGather;
6248 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6249 for (
Value *V : VL) {
6250 auto *
GEP = dyn_cast<GEPOperator>(V);
6253 Type *CurTy =
GEP->getSourceElementType();
6255 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
6256 return TreeEntry::NeedToGather;
6261 Type *Ty1 = VL0->getOperand(1)->getType();
6262 for (
Value *V : VL) {
6263 auto *
I = dyn_cast<GetElementPtrInst>(V);
6266 auto *
Op =
I->getOperand(1);
6267 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6268 (
Op->getType() != Ty1 &&
6269 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6270 Op->getType()->getScalarSizeInBits() >
6271 DL->getIndexSizeInBits(
6272 V->getType()->getPointerAddressSpace())))) {
6274 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
6275 return TreeEntry::NeedToGather;
6279 return TreeEntry::Vectorize;
6281 case Instruction::Store: {
6283 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6286 if (
DL->getTypeSizeInBits(ScalarTy) !=
6287 DL->getTypeAllocSizeInBits(ScalarTy)) {
6288 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
6289 return TreeEntry::NeedToGather;
6293 for (
Value *V : VL) {
6294 auto *
SI = cast<StoreInst>(V);
6295 if (!
SI->isSimple()) {
6297 return TreeEntry::NeedToGather;
6306 if (CurrentOrder.empty()) {
6307 Ptr0 = PointerOps.
front();
6308 PtrN = PointerOps.
back();
6310 Ptr0 = PointerOps[CurrentOrder.front()];
6311 PtrN = PointerOps[CurrentOrder.back()];
6313 std::optional<int> Dist =
6316 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
6317 return TreeEntry::Vectorize;
6321 return TreeEntry::NeedToGather;
6323 case Instruction::Call: {
6326 CallInst *CI = cast<CallInst>(VL0);
6337 return TreeEntry::NeedToGather;
6342 for (
unsigned J = 0; J != NumArgs; ++J)
6345 for (
Value *V : VL) {
6346 CallInst *CI2 = dyn_cast<CallInst>(V);
6352 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
6354 return TreeEntry::NeedToGather;
6358 for (
unsigned J = 0; J != NumArgs; ++J) {
6361 if (ScalarArgs[J] != A1J) {
6363 <<
"SLP: mismatched arguments in call:" << *CI
6364 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
6365 return TreeEntry::NeedToGather;
6374 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
6375 <<
"!=" << *V <<
'\n');
6376 return TreeEntry::NeedToGather;
6380 return TreeEntry::Vectorize;
6382 case Instruction::ShuffleVector: {
6385 if (!S.isAltShuffle()) {
6386 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
6387 return TreeEntry::NeedToGather;
6392 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and "
6393 "the whole alt sequence is not profitable.\n");
6394 return TreeEntry::NeedToGather;
6397 return TreeEntry::Vectorize;
6401 return TreeEntry::NeedToGather;
6415 PHIHandler() =
delete;
6417 : DT(DT), Main(Main), Phis(Phis),
6418 Operands(Main->getNumIncomingValues(),
6420 void buildOperands() {
6421 constexpr unsigned FastLimit = 4;
6431 auto *
P = cast<PHINode>(V);
6432 if (
P->getIncomingBlock(
I) == InBB)
6447 Blocks.try_emplace(InBB).first->second.push_back(
I);
6450 auto *
P = cast<PHINode>(V);
6451 for (
unsigned I : seq<unsigned>(0,
P->getNumIncomingValues())) {
6459 auto It =
Blocks.find(InBB);
6465 for (
const auto &
P :
Blocks) {
6466 if (
P.getSecond().size() <= 1)
6468 unsigned BasicI =
P.getSecond().front();
6471 [&](
const auto &Data) {
6472 return !Data.value() ||
6473 Data.value() ==
Operands[BasicI][Data.index()];
6475 "Expected empty operands list.");
6485 const EdgeInfo &UserTreeIdx) {
6491 auto TryToFindDuplicates = [&](
const InstructionsState &S,
6492 bool DoNotFail =
false) {
6495 for (
Value *V : VL) {
6502 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
6507 size_t NumUniqueScalarValues = UniqueValues.
size();
6508 if (NumUniqueScalarValues == VL.size()) {
6509 ReuseShuffleIndicies.
clear();
6512 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6513 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
6514 "for nodes with padding.\n");
6515 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6519 if (NumUniqueScalarValues <= 1 ||
6520 (UniquePositions.size() == 1 &&
all_of(UniqueValues,
6522 return isa<UndefValue>(V) ||
6525 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6526 if (DoNotFail && UniquePositions.size() > 1 &&
6527 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6529 return isa<ExtractElementInst>(V) ||
6530 areAllUsersVectorized(cast<Instruction>(V),
6534 if (PWSz == VL.size()) {
6535 ReuseShuffleIndicies.
clear();
6537 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
6538 NonUniqueValueVL.
append(PWSz - UniqueValues.
size(),
6539 UniqueValues.
back());
6540 VL = NonUniqueValueVL;
6545 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6556 if (!EphValues.
empty()) {
6557 for (
Value *V : VL) {
6558 if (EphValues.
count(V)) {
6560 <<
") is ephemeral.\n");
6561 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6571 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6576 cast<Instruction>(
I)->getOpcode() ==
6577 cast<Instruction>(S.MainOp)->getOpcode();
6579 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
6580 if (TryToFindDuplicates(S))
6581 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6582 ReuseShuffleIndicies);
6587 if (S.getOpcode() == Instruction::ExtractElement &&
6588 isa<ScalableVectorType>(
6589 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6590 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
6591 if (TryToFindDuplicates(S))
6592 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6593 ReuseShuffleIndicies);
6598 if (S.OpValue->getType()->isVectorTy() &&
6599 !isa<InsertElementInst>(S.OpValue)) {
6601 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6605 if (
StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6606 if (
SI->getValueOperand()->getType()->isVectorTy()) {
6607 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to store vector type.\n");
6608 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6617 auto &&NotProfitableForVectorization = [&S,
this,
6619 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6628 for (
Value *V : VL) {
6629 auto *
I = cast<Instruction>(V);
6631 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6635 if ((IsCommutative &&
6636 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6638 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
6640 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
6642 auto *
I1 = cast<Instruction>(VL.front());
6643 auto *I2 = cast<Instruction>(VL.back());
6646 I2->getOperand(
Op));
6647 if (
static_cast<unsigned>(
count_if(
6648 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6650 })) >= S.MainOp->getNumOperands() / 2)
6652 if (S.MainOp->getNumOperands() > 2)
6654 if (IsCommutative) {
6659 I2->getOperand((
Op + 1) % E));
6661 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6670 bool IsScatterVectorizeUserTE =
6671 UserTreeIdx.UserTE &&
6672 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6673 bool AreAllSameInsts =
6675 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6679 auto *
I = dyn_cast<GetElementPtrInst>(V);
6683 BB =
I->getParent();
6684 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
6687 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6690 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6693 NotProfitableForVectorization(VL)) {
6694 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
6695 if (TryToFindDuplicates(S))
6696 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6697 ReuseShuffleIndicies);
6705 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6706 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.OpValue <<
".\n");
6707 if (!E->isSame(VL)) {
6708 auto It = MultiNodeScalars.
find(S.OpValue);
6709 if (It != MultiNodeScalars.
end()) {
6710 auto *TEIt =
find_if(It->getSecond(),
6711 [&](TreeEntry *ME) { return ME->isSame(VL); });
6712 if (TEIt != It->getSecond().end())
6722 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
6723 if (TryToFindDuplicates(S))
6724 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6725 ReuseShuffleIndicies);
6731 E->UserTreeIndices.push_back(UserTreeIdx);
6732 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.OpValue
6739 for (
Value *V : VL) {
6740 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6743 if (getTreeEntry(V)) {
6745 <<
") is already in tree.\n");
6746 if (TryToFindDuplicates(S))
6747 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6748 ReuseShuffleIndicies);
6754 if (UserIgnoreList && !UserIgnoreList->empty()) {
6755 for (
Value *V : VL) {
6756 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6757 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
6758 if (TryToFindDuplicates(S))
6759 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6760 ReuseShuffleIndicies);
6768 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6769 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6771 assert(S.OpValue->getType()->isPointerTy() &&
6772 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6773 "Expected pointers only.");
6775 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
6776 assert(It != VL.end() &&
"Expected at least one GEP.");
6782 auto *VL0 = cast<Instruction>(S.OpValue);
6789 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6798 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6803 if (!TryToFindDuplicates(S,
true))
6809 TreeEntry::EntryState State = getScalarsVectorizationState(
6810 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6811 if (State == TreeEntry::NeedToGather) {
6812 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6813 ReuseShuffleIndicies);
6817 auto &BSRef = BlocksSchedules[BB];
6819 BSRef = std::make_unique<BlockScheduling>(BB);
6821 BlockScheduling &BS = *BSRef;
6823 std::optional<ScheduleData *> Bundle =
6824 BS.tryScheduleBundle(UniqueValues,
this, S);
6825#ifdef EXPENSIVE_CHECKS
6830 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
6831 assert((!BS.getScheduleData(VL0) ||
6832 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6833 "tryScheduleBundle should cancelScheduling on failure");
6834 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6835 ReuseShuffleIndicies);
6836 NonScheduledFirst.insert(VL.front());
6839 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
6841 unsigned ShuffleOrOp = S.isAltShuffle() ?
6842 (
unsigned) Instruction::ShuffleVector : S.getOpcode();
6843 switch (ShuffleOrOp) {
6844 case Instruction::PHI: {
6845 auto *PH = cast<PHINode>(VL0);
6848 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6852 PHIHandler Handler(*DT, PH, VL);
6853 Handler.buildOperands();
6854 for (
unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6855 TE->setOperand(
I, Handler.getOperands(
I));
6856 for (
unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6857 buildTree_rec(Handler.getOperands(
I),
Depth + 1, {TE, I});
6860 case Instruction::ExtractValue:
6861 case Instruction::ExtractElement: {
6862 if (CurrentOrder.empty()) {
6863 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
6864 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6865 ReuseShuffleIndicies);
6869 Op0.
assign(VL.size(), VL0->getOperand(0));
6870 VectorizableTree.back()->setOperand(0, Op0);
6874 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
6876 for (
unsigned Idx : CurrentOrder)
6883 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6884 ReuseShuffleIndicies, CurrentOrder);
6888 Op0.
assign(VL.size(), VL0->getOperand(0));
6889 VectorizableTree.back()->setOperand(0, Op0);
6892 case Instruction::InsertElement: {
6893 assert(ReuseShuffleIndicies.
empty() &&
"All inserts should be unique");
6895 auto OrdCompare = [](
const std::pair<int, int> &P1,
6896 const std::pair<int, int> &P2) {
6897 return P1.first > P2.first;
6900 decltype(OrdCompare)>
6901 Indices(OrdCompare);
6902 for (
int I = 0, E = VL.size();
I < E; ++
I) {
6904 Indices.emplace(
Idx,
I);
6906 OrdersType CurrentOrder(VL.size(), VL.size());
6907 bool IsIdentity =
true;
6908 for (
int I = 0, E = VL.size();
I < E; ++
I) {
6909 CurrentOrder[Indices.top().second] =
I;
6910 IsIdentity &= Indices.top().second ==
I;
6914 CurrentOrder.clear();
6915 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6916 std::nullopt, CurrentOrder);
6919 constexpr int NumOps = 2;
6921 for (
int I = 0;
I < NumOps; ++
I) {
6923 VectorOperands[
I].
push_back(cast<Instruction>(V)->getOperand(
I));
6925 TE->setOperand(
I, VectorOperands[
I]);
6927 buildTree_rec(VectorOperands[NumOps - 1],
Depth + 1, {
TE, NumOps - 1});
6930 case Instruction::Load: {
6937 TreeEntry *
TE =
nullptr;
6940 case TreeEntry::Vectorize:
6941 if (CurrentOrder.empty()) {
6943 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6944 ReuseShuffleIndicies);
6948 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6949 ReuseShuffleIndicies, CurrentOrder);
6952 TE->setOperandsInOrder();
6954 case TreeEntry::StridedVectorize:
6956 if (CurrentOrder.empty()) {
6957 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6958 UserTreeIdx, ReuseShuffleIndicies);
6960 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6961 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6963 TE->setOperandsInOrder();
6966 case TreeEntry::ScatterVectorize:
6968 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6969 UserTreeIdx, ReuseShuffleIndicies);
6970 TE->setOperandsInOrder();
6971 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
6972 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
6974 case TreeEntry::NeedToGather:
6979 case Instruction::ZExt:
6980 case Instruction::SExt:
6981 case Instruction::FPToUI:
6982 case Instruction::FPToSI:
6983 case Instruction::FPExt:
6984 case Instruction::PtrToInt:
6985 case Instruction::IntToPtr:
6986 case Instruction::SIToFP:
6987 case Instruction::UIToFP:
6988 case Instruction::Trunc:
6989 case Instruction::FPTrunc:
6990 case Instruction::BitCast: {
6991 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6992 std::make_pair(std::numeric_limits<unsigned>::min(),
6993 std::numeric_limits<unsigned>::max()));
6994 if (ShuffleOrOp == Instruction::ZExt ||
6995 ShuffleOrOp == Instruction::SExt) {
6996 CastMaxMinBWSizes = std::make_pair(
7002 }
else if (ShuffleOrOp == Instruction::Trunc) {
7003 CastMaxMinBWSizes = std::make_pair(
7009 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
7010 }
else if (ShuffleOrOp == Instruction::SIToFP ||
7011 ShuffleOrOp == Instruction::UIToFP) {
7012 unsigned NumSignBits =
7014 if (
auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7016 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
7018 if (NumSignBits * 2 >=
7020 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
7022 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7023 ReuseShuffleIndicies);
7026 TE->setOperandsInOrder();
7027 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7031 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
7037 case Instruction::ICmp:
7038 case Instruction::FCmp: {
7041 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7042 ReuseShuffleIndicies);
7050 "Commutative Predicate mismatch");
7051 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7054 for (
Value *V : VL) {
7055 auto *
Cmp = cast<CmpInst>(V);
7058 if (
Cmp->getPredicate() != P0)
7060 Left.push_back(LHS);
7061 Right.push_back(RHS);
7068 if (ShuffleOrOp == Instruction::ICmp) {
7069 unsigned NumSignBits0 =
7071 if (NumSignBits0 * 2 >=
7073 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
7074 unsigned NumSignBits1 =
7076 if (NumSignBits1 * 2 >=
7078 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
7082 case Instruction::Select:
7083 case Instruction::FNeg:
7084 case Instruction::Add:
7085 case Instruction::FAdd:
7086 case Instruction::Sub:
7087 case Instruction::FSub:
7088 case Instruction::Mul:
7089 case Instruction::FMul:
7090 case Instruction::UDiv:
7091 case Instruction::SDiv:
7092 case Instruction::FDiv:
7093 case Instruction::URem:
7094 case Instruction::SRem:
7095 case Instruction::FRem:
7096 case Instruction::Shl:
7097 case Instruction::LShr:
7098 case Instruction::AShr:
7099 case Instruction::And:
7100 case Instruction::Or:
7101 case Instruction::Xor: {
7102 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7103 ReuseShuffleIndicies);
7110 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7118 TE->setOperandsInOrder();
7119 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7123 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
7129 case Instruction::GetElementPtr: {
7130 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7131 ReuseShuffleIndicies);
7135 for (
Value *V : VL) {
7136 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
7141 Operands.front().push_back(
GEP->getPointerOperand());
7150 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7152 [VL0Ty, IndexIdx](
Value *V) {
7153 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
7156 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
7160 ->getPointerOperandType()
7163 for (
Value *V : VL) {
7164 auto *
I = dyn_cast<GetElementPtrInst>(V);
7167 ConstantInt::get(Ty, 0,
false));
7170 auto *
Op =
I->getOperand(IndexIdx);
7171 auto *CI = dyn_cast<ConstantInt>(
Op);
7176 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7180 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
7184 case Instruction::Store: {
7188 for (
Value *V : VL) {
7189 auto *
SI = cast<StoreInst>(V);
7190 *OIter =
SI->getValueOperand();
7194 if (CurrentOrder.empty()) {
7196 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7197 ReuseShuffleIndicies);
7198 TE->setOperandsInOrder();
7203 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7204 ReuseShuffleIndicies, CurrentOrder);
7205 TE->setOperandsInOrder();
7207 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
7211 case Instruction::Call: {
7214 CallInst *CI = cast<CallInst>(VL0);
7217 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7218 ReuseShuffleIndicies);
7223 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7227 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7231 for (
Value *V : VL) {
7232 auto *CI2 = cast<CallInst>(V);
7239 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7246 TE->setOperandsInOrder();
7247 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
7254 for (
Value *V : VL) {
7255 auto *CI2 = cast<CallInst>(V);
7262 case Instruction::ShuffleVector: {
7263 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7264 ReuseShuffleIndicies);
7268 auto *CI = dyn_cast<CmpInst>(VL0);
7269 if (isa<BinaryOperator>(VL0) || CI) {
7272 return cast<CmpInst>(V)->isCommutative();
7274 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7276 auto *MainCI = cast<CmpInst>(S.MainOp);
7277 auto *AltCI = cast<CmpInst>(S.AltOp);
7281 "Expected different main/alternate predicates.");
7284 for (
Value *V : VL) {
7285 auto *
Cmp = cast<CmpInst>(V);
7296 Left.push_back(LHS);
7297 Right.push_back(RHS);
7307 TE->setOperandsInOrder();
7308 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7312 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
7328 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7329 if (
auto *ST = dyn_cast<StructType>(EltTy)) {
7331 for (
const auto *Ty : ST->elements())
7332 if (Ty != *ST->element_begin())
7334 N *= ST->getNumElements();
7335 EltTy = *ST->element_begin();
7336 }
else if (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
7337 N *= AT->getNumElements();
7338 EltTy = AT->getElementType();
7340 auto *VT = cast<FixedVectorType>(EltTy);
7341 N *= VT->getNumElements();
7342 EltTy = VT->getElementType();
7349 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7357 bool ResizeAllowed)
const {
7358 const auto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7359 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
7360 auto *E0 = cast<Instruction>(*It);
7362 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7366 Value *Vec = E0->getOperand(0);
7368 CurrentOrder.
clear();
7372 if (E0->getOpcode() == Instruction::ExtractValue) {
7377 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7381 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
7384 unsigned E = VL.
size();
7385 if (!ResizeAllowed && NElts != E)
7388 unsigned MinIdx = NElts, MaxIdx = 0;
7390 auto *Inst = dyn_cast<Instruction>(V);
7393 if (Inst->getOperand(0) != Vec)
7395 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
7396 if (isa<UndefValue>(EE->getIndexOperand()))
7401 const unsigned ExtIdx = *
Idx;
7402 if (ExtIdx >= NElts)
7404 Indices[
I] = ExtIdx;
7405 if (MinIdx > ExtIdx)
7407 if (MaxIdx < ExtIdx)
7410 if (MaxIdx - MinIdx + 1 > E)
7412 if (MaxIdx + 1 <= E)
7416 bool ShouldKeepOrder =
true;
7422 CurrentOrder.
assign(E, E);
7423 for (
unsigned I = 0;
I < E; ++
I) {
7426 const unsigned ExtIdx = Indices[
I] - MinIdx;
7427 if (CurrentOrder[ExtIdx] != E) {
7428 CurrentOrder.
clear();
7431 ShouldKeepOrder &= ExtIdx ==
I;
7432 CurrentOrder[ExtIdx] =
I;
7434 if (ShouldKeepOrder)
7435 CurrentOrder.
clear();
7437 return ShouldKeepOrder;
7440bool BoUpSLP::areAllUsersVectorized(
7442 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
7444 return ScalarToTreeEntry.contains(U) ||
7445 isVectorLikeInstWithConstOps(U) ||
7446 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7450static std::pair<InstructionCost, InstructionCost>
7458 if (
auto *FPCI = dyn_cast<FPMathOperator>(CI))
7459 FMF = FPCI->getFastMathFlags();
7462 dyn_cast<IntrinsicInst>(CI));
7463 auto IntrinsicCost =
7470 auto LibCost = IntrinsicCost;
7477 return {IntrinsicCost, LibCost};
7480void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7484 unsigned Sz = Scalars.size();
7487 if (!ReorderIndices.empty())
7489 for (
unsigned I = 0;
I < Sz; ++
I) {
7491 if (!ReorderIndices.empty())
7493 auto *OpInst = cast<Instruction>(Scalars[
Idx]);
7494 if (IsAltOp(OpInst)) {
7504 if (!ReuseShuffleIndices.empty()) {
7507 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7517 if (
auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7518 auto *AltCI = cast<CmpInst>(AltOp);
7521 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
7522 auto *CI = cast<CmpInst>(
I);
7530 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
7531 "CmpInst expected to match either main or alternate predicate or "
7534 return MainP !=
P && MainP != SwappedP;
7541 const auto *Op0 = Ops.
front();
7547 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
7551 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
7553 if (
auto *CI = dyn_cast<ConstantInt>(V))
7554 return CI->getValue().isPowerOf2();
7557 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
7559 if (
auto *CI = dyn_cast<ConstantInt>(V))
7560 return CI->getValue().isNegatedPowerOf2();
7565 if (IsConstant && IsUniform)
7567 else if (IsConstant)
7581class BaseShuffleAnalysis {
7588 int Limit =
Mask.size();
7600 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
7616 unsigned VF =
Mask.size();
7618 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
7621 int MaskedIdx =
Mask[ExtMask[
I] % VF];
7662 bool SinglePermute) {
7666 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
7668 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7674 if (isIdentityMask(Mask, SVTy,
false)) {
7675 if (!IdentityOp || !SinglePermute ||
7676 (isIdentityMask(Mask, SVTy,
true) &&
7678 IdentityMask.
size()))) {
7683 IdentityMask.
assign(Mask);
7703 if (SV->isZeroEltSplat()) {
7705 IdentityMask.
assign(Mask);
7707 int LocalVF =
Mask.size();
7709 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7710 LocalVF = SVOpTy->getNumElements();
7714 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
7716 ExtMask[
Idx] = SV->getMaskValue(
I);
7726 if (!IsOp1Undef && !IsOp2Undef) {
7728 for (
int &
I : Mask) {
7731 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
7738 SV->getShuffleMask().end());
7739 combineMasks(LocalVF, ShuffleMask, Mask);
7740 Mask.swap(ShuffleMask);
7742 Op = SV->getOperand(0);
7744 Op = SV->getOperand(1);
7746 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
7747 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7752 "Expected masks of same sizes.");
7757 Mask.swap(IdentityMask);
7758 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7759 return SinglePermute &&
7760 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
7762 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
7763 Shuffle->isZeroEltSplat() &&
7776 template <
typename T,
typename ShuffleBuilderTy>
7778 ShuffleBuilderTy &Builder) {
7779 assert(V1 &&
"Expected at least one vector value.");
7781 Builder.resizeToMatch(V1, V2);
7782 int VF =
Mask.size();
7783 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
7784 VF = FTy->getNumElements();
7791 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7794 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7796 CombinedMask1[
I] =
Mask[
I];
7798 CombinedMask2[
I] =
Mask[
I] - VF;
7805 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
7806 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
7809 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7810 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7815 ExtMask1[
Idx] = SV1->getMaskValue(
I);
7818 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7820 ExtMask1, UseMask::SecondArg);
7825 ExtMask2[
Idx] = SV2->getMaskValue(
I);
7828 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7830 ExtMask2, UseMask::SecondArg);
7831 if (SV1->getOperand(0)->getType() ==
7832 SV2->getOperand(0)->getType() &&
7833 SV1->getOperand(0)->getType() != SV1->getType() &&
7836 Op1 = SV1->getOperand(0);
7837 Op2 = SV2->getOperand(0);
7839 SV1->getShuffleMask().end());
7840 int LocalVF = ShuffleMask1.size();
7841 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
7842 LocalVF = FTy->getNumElements();
7843 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7844 CombinedMask1.swap(ShuffleMask1);
7846 SV2->getShuffleMask().end());
7847 LocalVF = ShuffleMask2.size();
7848 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
7849 LocalVF = FTy->getNumElements();
7850 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7851 CombinedMask2.swap(ShuffleMask2);
7854 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
7855 Builder.resizeToMatch(Op1, Op2);
7856 VF = std::max(cast<VectorType>(Op1->
getType())
7858 .getKnownMinValue(),
7859 cast<VectorType>(Op2->
getType())
7861 .getKnownMinValue());
7862 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7865 "Expected undefined mask element");
7866 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
7872 isa<ShuffleVectorInst>(Op1) &&
7873 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7875 return Builder.createIdentity(Op1);
7876 return Builder.createShuffleVector(
7880 if (isa<PoisonValue>(V1))
7881 return Builder.createPoison(
7882 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
7884 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
7885 assert(V1 &&
"Expected non-null value after looking through shuffles.");
7888 return Builder.createShuffleVector(V1, NewMask);
7889 return Builder.createIdentity(V1);
7905 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7908 Mask, NumSrcElts, NumSubElts,
Index)) {
7909 if (
Index + NumSubElts > NumSrcElts &&
7910 Index + NumSrcElts <=
static_cast<int>(Mask.size()))
7920static std::pair<InstructionCost, InstructionCost>
7931 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7941 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7945 for (
Value *V : Ptrs) {
7950 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
7955 if (!
Ptr || !
Ptr->hasOneUse())
7959 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
7965 TTI::PointersChainInfo::getKnownStride(),
7975 [](
const Value *V) {
7976 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
7977 return Ptr && !
Ptr->hasAllConstantIndices();
7979 ? TTI::PointersChainInfo::getUnknownStride()
7980 : TTI::PointersChainInfo::getKnownStride();
7984 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
7986 auto *It =
find_if(Ptrs, IsaPred<GEPOperator>);
7987 if (It != Ptrs.
end())
7988 BaseGEP = cast<GEPOperator>(*It);
7993 BaseGEP->getPointerOperand(), Indices, VecTy,
7998 return std::make_pair(ScalarCost, VecCost);
8003 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8004 TreeEntry &E = *TE.get();
8005 switch (E.getOpcode()) {
8006 case Instruction::Load: {
8007 Type *ScalarTy = E.getMainOp()->getType();
8009 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8016 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8023 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8024 false, CommonAlignment,
CostKind, BaseLI);
8025 if (StridedCost < OriginalVecCost)
8028 E.State = TreeEntry::StridedVectorize;
8032 case Instruction::Store: {
8034 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8036 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8043 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8050 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8051 false, CommonAlignment,
CostKind, BaseSI);
8052 if (StridedCost < OriginalVecCost)
8055 E.State = TreeEntry::StridedVectorize;
8072 bool IsFinalized =
false;
8075 Type *ScalarTy =
nullptr;
8086 bool SameNodesEstimated =
true;
8095 if (
auto *VTy = dyn_cast<VectorType>(Ty))
8111 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8112 unsigned MinVF = R.getMinVF(2 * Sz);
8113 if (VL.
size() > 2 &&
8114 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8115 (InVectors.
empty() &&
8118 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8119 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8120 return S.getOpcode() == Instruction::Load &&
8123 !
all_of(Gathers, [&](
Value *V) {
return R.getTreeEntry(V); }) &&
8129 unsigned StartIdx = 0;
8130 unsigned VF = VL.
size() / 2;
8131 for (; VF >= MinVF; VF /= 2) {
8132 for (
unsigned Cnt = StartIdx,
End = VL.
size(); Cnt + VF <=
End;
8135 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8137 if (SliceS.getOpcode() != Instruction::Load ||
8138 SliceS.isAltShuffle())
8146 CurrentOrder, PointerOps);
8156 CurrentOrder.
empty()) ||
8165 if (Cnt == StartIdx)
8174 if (StartIdx >= VL.
size())
8177 if (!VectorizedLoads.
empty())
8180 if (!VectorizedLoads.
empty()) {
8182 bool NeedInsertSubvectorAnalysis =
8183 !NumParts || (VL.
size() / VF) > NumParts;
8189 getBuildVectorCost(VL.
slice(
I, std::min(
End -
I, VF)), Root);
8196 for (
Value *V : VectorizedLoads) {
8197 auto *LI = cast<LoadInst>(V);
8204 for (
const std::pair<unsigned, LoadsState> &
P : VectorizedStarts) {
8205 auto *LI = cast<LoadInst>(VL[
P.first]);
8214 false, Alignment, CostKind, LI);
8218 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8219 auto [ScalarGEPCost, VectorGEPCost] =
8221 Instruction::Load, CostKind, LI->
getType(), LoadTy);
8222 GatherCost += VectorGEPCost - ScalarGEPCost;
8224 for (
unsigned P : ScatterVectorized) {
8225 auto *LI0 = cast<LoadInst>(VL[
P]);
8227 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8229 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8230 false, CommonAlignment, CostKind, LI0);
8234 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8242 auto [ScalarGEPCost, VectorGEPCost] =
8244 CostKind, ScalarTy, VecTy);
8245 GatherCost += VectorGEPCost - ScalarGEPCost;
8246 if (!Order.
empty()) {
8250 VecTy, Mask, CostKind);
8253 GatherCost += R.getGatherCost(PointerOps,
true,
8254 PointerOps.
front()->getType());
8257 if (NeedInsertSubvectorAnalysis) {
8260 for (
unsigned I = VF, E = VL.
size();
I < E;
I += VF) {
8261 for (
unsigned Idx : seq<unsigned>(0, E))
8264 ShuffleMask, CostKind,
I, LoadTy);
8267 GatherCost -= ScalarsCost;
8269 GatherCost = std::min(BaseCost, GatherCost);
8270 }
else if (!Root &&
isSplat(VL)) {
8273 const auto *It =
find_if_not(VL, IsaPred<UndefValue>);
8274 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
8277 count(VL, *It) > 1 &&
8281 CostKind, std::distance(VL.
begin(), It),
8286 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8292 VecTy, ShuffleMask, CostKind,
8297 (
all_of(Gathers, IsaPred<UndefValue>)
8299 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
8307 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8308 unsigned NumParts) {
8309 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
8311 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
8312 auto *EE = dyn_cast<ExtractElementInst>(V);
8315 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8318 return std::max(Sz, VecTy->getNumElements());
8320 unsigned NumSrcRegs =
8322 if (NumSrcRegs == 0)
8327 auto CheckPerRegistersShuffle =
8332 int FirstRegId = -1;
8333 for (
int &
I : Mask) {
8336 int RegId = (
I / NumElts) * NumParts + (
I % NumElts) / EltsPerVector;
8339 RegIndices.
insert(RegId);
8340 if (RegIndices.
size() > 2)
8341 return std::nullopt;
8342 if (RegIndices.
size() == 2)
8344 I = (
I % NumElts) % EltsPerVector +
8345 (RegId == FirstRegId ? 0 : EltsPerVector);
8354 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8355 if (!ShuffleKinds[Part])
8358 Mask.slice(Part * EltsPerVector,
8359 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8360 ? Mask.size() % EltsPerVector
8364 std::optional<TTI::ShuffleKind> RegShuffleKind =
8365 CheckPerRegistersShuffle(SubMask);
8366 if (!RegShuffleKind) {
8385 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8392 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
8394 unsigned SliceSize) {
8395 if (SameNodesEstimated) {
8401 if ((InVectors.
size() == 2 &&
8402 InVectors.
front().get<
const TreeEntry *>() == &E1 &&
8403 InVectors.
back().get<
const TreeEntry *>() == E2) ||
8404 (!E2 && InVectors.
front().get<
const TreeEntry *>() == &E1)) {
8407 "Expected all poisoned elements.");
8410 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
8415 Cost += createShuffle(InVectors.
front(),
8416 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
8418 transformMaskAfterShuffle(CommonMask, CommonMask);
8420 SameNodesEstimated =
false;
8421 if (!E2 && InVectors.
size() == 1) {
8422 unsigned VF = E1.getVectorFactor();
8425 cast<FixedVectorType>(V1->
getType())->getNumElements());
8427 const auto *E = InVectors.
front().get<
const TreeEntry *>();
8428 VF = std::max(VF, E->getVectorFactor());
8430 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8432 CommonMask[
Idx] = Mask[
Idx] + VF;
8433 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
8434 transformMaskAfterShuffle(CommonMask, CommonMask);
8436 Cost += createShuffle(&E1, E2, Mask);
8437 transformMaskAfterShuffle(CommonMask, Mask);
8441 class ShuffleCostBuilder {
8444 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
8446 return Mask.empty() ||
8447 (VF == Mask.size() &&
8455 ~ShuffleCostBuilder() =
default;
8460 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
8461 if (isEmptyOrIdentity(Mask, VF))
8464 cast<VectorType>(V1->
getType()), Mask);
8469 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
8470 if (isEmptyOrIdentity(Mask, VF))
8473 cast<VectorType>(V1->
getType()), Mask);
8479 void resizeToMatch(
Value *&,
Value *&)
const {}
8489 ShuffleCostBuilder Builder(
TTI);
8492 unsigned CommonVF = Mask.size();
8494 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
8496 if (E.State == TreeEntry::NeedToGather &&
allConstant(E.Scalars))
8498 Type *EScalarTy = E.Scalars.front()->getType();
8499 bool IsSigned =
true;
8500 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8502 IsSigned = It->second.second;
8504 if (EScalarTy != ScalarTy) {
8505 unsigned CastOpcode = Instruction::Trunc;
8506 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8507 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8509 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8518 if (isa<Constant>(V))
8520 auto *VecTy = cast<VectorType>(V->getType());
8522 if (EScalarTy != ScalarTy) {
8524 unsigned CastOpcode = Instruction::Trunc;
8525 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8526 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8528 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8535 if (!V1 && !V2 && !P2.
isNull()) {
8537 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8538 unsigned VF = E->getVectorFactor();
8539 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8540 CommonVF = std::max(VF, E2->getVectorFactor());
8543 return Idx < 2 * static_cast<int>(CommonVF);
8545 "All elements in mask must be less than 2 * CommonVF.");
8546 if (E->Scalars.size() == E2->Scalars.size()) {
8550 for (
int &
Idx : CommonMask) {
8553 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
8555 else if (
Idx >=
static_cast<int>(CommonVF))
8556 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
8560 CommonVF = E->Scalars.size();
8561 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8562 GetNodeMinBWAffectedCost(*E2, CommonVF);
8564 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8565 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8569 }
else if (!V1 && P2.
isNull()) {
8571 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8572 unsigned VF = E->getVectorFactor();
8576 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8577 "All elements in mask must be less than CommonVF.");
8578 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8580 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
8581 for (
int &
Idx : CommonMask) {
8585 CommonVF = E->Scalars.size();
8587 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8590 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8591 CommonVF == CommonMask.
size() &&
8593 [](
const auto &&
P) {
8595 static_cast<unsigned>(
P.value()) !=
P.index();
8603 }
else if (V1 && P2.
isNull()) {
8605 ExtraCost += GetValueMinBWAffectedCost(V1);
8606 CommonVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8609 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8610 "All elements in mask must be less than CommonVF.");
8611 }
else if (V1 && !V2) {
8613 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8614 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8615 CommonVF = std::max(VF, E2->getVectorFactor());
8618 return Idx < 2 * static_cast<int>(CommonVF);
8620 "All elements in mask must be less than 2 * CommonVF.");
8621 if (E2->Scalars.size() == VF && VF != CommonVF) {
8623 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
8624 for (
int &
Idx : CommonMask) {
8627 if (
Idx >=
static_cast<int>(CommonVF))
8628 Idx = E2Mask[
Idx - CommonVF] + VF;
8632 ExtraCost += GetValueMinBWAffectedCost(V1);
8634 ExtraCost += GetNodeMinBWAffectedCost(
8635 *E2, std::min(CommonVF, E2->getVectorFactor()));
8637 }
else if (!V1 && V2) {
8639 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8640 const TreeEntry *E1 = P1.
get<
const TreeEntry *>();
8641 CommonVF = std::max(VF, E1->getVectorFactor());
8644 return Idx < 2 * static_cast<int>(CommonVF);
8646 "All elements in mask must be less than 2 * CommonVF.");
8647 if (E1->Scalars.size() == VF && VF != CommonVF) {
8649 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
8650 for (
int &
Idx : CommonMask) {
8653 if (
Idx >=
static_cast<int>(CommonVF))
8654 Idx = E1Mask[
Idx - CommonVF] + VF;
8660 ExtraCost += GetNodeMinBWAffectedCost(
8661 *E1, std::min(CommonVF, E1->getVectorFactor()));
8663 ExtraCost += GetValueMinBWAffectedCost(V2);
8666 assert(V1 && V2 &&
"Expected both vectors.");
8667 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8669 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8672 return Idx < 2 * static_cast<int>(CommonVF);
8674 "All elements in mask must be less than 2 * CommonVF.");
8676 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8677 if (V1->
getType() != V2->getType()) {
8681 if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
8683 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8689 if (InVectors.
size() == 2)
8691 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8692 V1, V2, CommonMask, Builder);
8699 : ScalarTy(ScalarTy),
TTI(
TTI),
8700 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8701 CheckedExtracts(CheckedExtracts) {}
8703 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8704 unsigned NumParts,
bool &UseVecBaseAsInput) {
8705 UseVecBaseAsInput =
false;
8708 Value *VecBase =
nullptr;
8711 if (NumParts == VL.
size())
8715 bool PrevNodeFound =
any_of(
8717 [&](
const std::unique_ptr<TreeEntry> &TE) {
8718 return ((!TE->isAltShuffle() &&
8719 TE->getOpcode() == Instruction::ExtractElement) ||
8720 TE->State == TreeEntry::NeedToGather) &&
8721 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8722 return VL.size() > Data.index() &&
8723 (Mask[Data.index()] == PoisonMaskElem ||
8724 isa<UndefValue>(VL[Data.index()]) ||
8725 Data.value() == VL[Data.index()]);
8729 unsigned SliceSize = VL.
size() / NumParts;
8730 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8731 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8732 for (
auto [
I, V] :
enumerate(VL.
slice(Part * SliceSize, SliceSize))) {
8734 if (isa<UndefValue>(V) ||
8743 auto *EE = cast<ExtractElementInst>(V);
8744 VecBase = EE->getVectorOperand();
8745 UniqueBases.
insert(VecBase);
8746 const TreeEntry *VE = R.getTreeEntry(V);
8747 if (!CheckedExtracts.
insert(V).second ||
8748 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8751 return isa<GetElementPtrInst>(U) &&
8752 !R.areAllUsersVectorized(cast<Instruction>(U),
8760 unsigned Idx = *EEIdx;
8762 if (EE->hasOneUse() || !PrevNodeFound) {
8764 if (isa<SExtInst, ZExtInst>(Ext) &&
8765 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8770 EE->getVectorOperandType(),
Idx);
8773 Ext->getOpcode(), Ext->getType(), EE->getType(),
8789 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8792 transformMaskAfterShuffle(CommonMask, CommonMask);
8793 SameNodesEstimated =
false;
8794 if (NumParts != 1 && UniqueBases.
size() != 1) {
8795 UseVecBaseAsInput =
true;
8803 std::optional<InstructionCost>
8807 return std::nullopt;
8813 return Idx < static_cast<int>(E1.getVectorFactor());
8815 "Expected single vector shuffle mask.");
8819 if (InVectors.
empty()) {
8820 CommonMask.
assign(Mask.begin(), Mask.end());
8821 InVectors.
assign({&E1, &E2});
8824 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8827 if (NumParts == 0 || NumParts >= Mask.size())
8829 unsigned SliceSize = Mask.size() / NumParts;
8832 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8833 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8836 if (InVectors.
empty()) {
8837 CommonMask.
assign(Mask.begin(), Mask.end());
8838 InVectors.
assign(1, &E1);
8841 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8844 if (NumParts == 0 || NumParts >= Mask.size())
8846 unsigned SliceSize = Mask.size() / NumParts;
8849 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8850 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
8851 if (!SameNodesEstimated && InVectors.
size() == 1)
8864 cast<ExtractElementInst>(InVectors.
front()
8865 .get<
const TreeEntry *>()
8866 ->Scalars[
P.index()]);
8867 return EI->getVectorOperand() == V1 ||
8868 EI->getVectorOperand() == V2;
8870 "Expected extractelement vectors.");
8874 if (InVectors.
empty()) {
8876 "Expected empty input mask/vectors.");
8877 CommonMask.
assign(Mask.begin(), Mask.end());
8884 InVectors.
front().is<
const TreeEntry *>() && !CommonMask.
empty() &&
8888 .get<const TreeEntry *>()
8889 ->Scalars[
P.index()];
8891 return P.value() == Mask[
P.index()] ||
8892 isa<UndefValue>(Scalar);
8893 if (isa<Constant>(V1))
8895 auto *EI = cast<ExtractElementInst>(Scalar);
8896 return EI->getVectorOperand() == V1;
8898 "Expected only tree entry for extractelement vectors.");
8902 "Expected only tree entries from extracts/reused buildvectors.");
8903 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8904 if (InVectors.
size() == 2) {
8905 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
8906 transformMaskAfterShuffle(CommonMask, CommonMask);
8907 VF = std::max<unsigned>(VF, CommonMask.
size());
8908 }
else if (
const auto *InTE =
8909 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
8910 VF = std::max(VF, InTE->getVectorFactor());
8914 ->getNumElements());
8917 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8919 CommonMask[
Idx] = Mask[
Idx] + VF;
8922 Value *Root =
nullptr) {
8923 Cost += getBuildVectorCost(VL, Root);
8927 unsigned VF = VL.
size();
8929 VF = std::min(VF, MaskVF);
8931 if (isa<UndefValue>(V)) {
8941 cast<FixedVectorType>(Root->
getType())->getNumElements()),
8942 getAllOnesValue(*R.DL, ScalarTy));
8952 if (InVectors.
size() == 2)
8953 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
8955 Cost += createShuffle(Vec,
nullptr, CommonMask);
8956 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8960 "Expected vector length for the final value before action.");
8962 Action(V, CommonMask);
8963 InVectors.
front() = V;
8966 if (CommonMask.
empty()) {
8967 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
8971 createShuffle(InVectors.
front(),
8972 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
8978 "Shuffle construction must be finalized.");
8982const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
8983 unsigned Idx)
const {
8985 if (
const TreeEntry *TE = getTreeEntry(
Op)) {
8986 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8987 return EI.EdgeIdx == Idx && EI.UserTE == E;
8988 }) != TE->UserTreeIndices.end())
8990 auto MIt = MultiNodeScalars.
find(
Op);
8991 if (MIt != MultiNodeScalars.
end()) {
8992 for (
const TreeEntry *TE : MIt->second) {
8993 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8994 return EI.EdgeIdx == Idx && EI.UserTE == E;
8995 }) != TE->UserTreeIndices.end())
9001 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9002 return TE->State == TreeEntry::NeedToGather &&
9003 find_if(
TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
9004 return EI.EdgeIdx == Idx && EI.UserTE == E;
9005 }) !=
TE->UserTreeIndices.end();
9007 assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
9012 if (
TE.State == TreeEntry::ScatterVectorize ||
9013 TE.State == TreeEntry::StridedVectorize)
9015 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
9016 !
TE.isAltShuffle()) {
9017 if (
TE.ReorderIndices.empty())
9056 Type *ScalarTy = VL[0]->getType();
9057 if (E->State != TreeEntry::NeedToGather) {
9058 if (
auto *SI = dyn_cast<StoreInst>(VL[0]))
9059 ScalarTy =
SI->getValueOperand()->getType();
9060 else if (
auto *CI = dyn_cast<CmpInst>(VL[0]))
9062 else if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9063 ScalarTy =
IE->getOperand(1)->getType();
9072 auto It = MinBWs.
find(E);
9073 Type *OrigScalarTy = ScalarTy;
9074 if (It != MinBWs.
end()) {
9078 unsigned EntryVF = E->getVectorFactor();
9081 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9082 if (E->State == TreeEntry::NeedToGather) {
9085 if (isa<InsertElementInst>(VL[0]))
9087 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9088 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
9093 if (!E->ReorderIndices.empty() &&
9094 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9096 if (E->getOpcode() == Instruction::Store) {
9098 NewMask.
resize(E->ReorderIndices.size());
9099 copy(E->ReorderIndices, NewMask.
begin());
9105 if (NeedToShuffleReuses)
9106 ::addMask(Mask, E->ReuseShuffleIndices);
9110 assert((E->State == TreeEntry::Vectorize ||
9111 E->State == TreeEntry::ScatterVectorize ||
9112 E->State == TreeEntry::StridedVectorize) &&
9116 (E->getOpcode() == Instruction::GetElementPtr &&
9117 E->getMainOp()->getType()->isPointerTy())) &&
9120 unsigned ShuffleOrOp =
9121 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
9123 const unsigned Sz = UniqueValues.
size();
9125 for (
unsigned I = 0;
I < Sz; ++
I) {
9126 if (getTreeEntry(UniqueValues[
I]) == E)
9130 auto GetCastContextHint = [&](
Value *
V) {
9131 if (
const TreeEntry *OpTE = getTreeEntry(V))
9132 return getCastContextHint(*OpTE);
9133 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
9134 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9143 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
9147 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9149 for (
unsigned I = 0;
I < Sz; ++
I) {
9150 if (UsedScalars.test(
I))
9152 ScalarCost += ScalarEltCost(
I);
9160 const EdgeInfo &EI = E->UserTreeIndices.front();
9161 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9163 It != MinBWs.
end()) {
9164 auto UserBWIt = MinBWs.
find(EI.UserTE);
9165 Type *UserScalarTy =
9166 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9167 if (UserBWIt != MinBWs.
end())
9169 UserBWIt->second.first);
9170 if (ScalarTy != UserScalarTy) {
9171 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9172 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
9177 VecOpcode = Instruction::Trunc;
9180 It->second.second ? Instruction::SExt : Instruction::ZExt;
9187 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9188 ScalarCost,
"Calculated costs for Tree"));
9189 return VecCost - ScalarCost;
9194 assert((E->State == TreeEntry::Vectorize ||
9195 E->State == TreeEntry::StridedVectorize) &&
9196 "Entry state expected to be Vectorize or StridedVectorize here.");
9200 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
9201 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9202 "Calculated GEPs cost for Tree"));
9204 return VecCost - ScalarCost;
9207 switch (ShuffleOrOp) {
9208 case Instruction::PHI: {
9212 for (
Value *V : UniqueValues) {
9213 auto *
PHI = dyn_cast<PHINode>(V);
9218 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
9222 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
9224 if (!OpTE->ReuseShuffleIndices.empty())
9225 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9226 OpTE->Scalars.size());
9229 return CommonCost - ScalarCost;
9231 case Instruction::ExtractValue:
9232 case Instruction::ExtractElement: {
9233 auto GetScalarCost = [&](
unsigned Idx) {
9234 auto *
I = cast<Instruction>(UniqueValues[
Idx]);
9236 if (ShuffleOrOp == Instruction::ExtractElement) {
9237 auto *EE = cast<ExtractElementInst>(
I);
9238 SrcVecTy = EE->getVectorOperandType();
9240 auto *EV = cast<ExtractValueInst>(
I);
9241 Type *AggregateTy = EV->getAggregateOperand()->getType();
9243 if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9244 NumElts = ATy->getNumElements();
9249 if (
I->hasOneUse()) {
9251 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9252 all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
9259 Ext->getOpcode(),
Ext->getType(),
I->getType(),
9267 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
9268 return GetCostDiff(GetScalarCost, GetVectorCost);
9270 case Instruction::InsertElement: {
9271 assert(E->ReuseShuffleIndices.empty() &&
9272 "Unique insertelements only are expected.");
9273 auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
9274 unsigned const NumElts = SrcVecTy->getNumElements();
9275 unsigned const NumScalars = VL.
size();
9281 unsigned OffsetEnd = OffsetBeg;
9282 InsertMask[OffsetBeg] = 0;
9285 if (OffsetBeg >
Idx)
9287 else if (OffsetEnd <
Idx)
9289 InsertMask[
Idx] =
I + 1;
9293 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9294 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9296 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9297 unsigned InsertVecSz = std::min<unsigned>(
9299 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9300 bool IsWholeSubvector =
9301 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9305 if (OffsetBeg + InsertVecSz > VecSz) {
9308 InsertVecSz = VecSz;
9314 if (!E->ReorderIndices.empty()) {
9319 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
9321 bool IsIdentity =
true;
9323 Mask.swap(PrevMask);
9324 for (
unsigned I = 0;
I < NumScalars; ++
I) {
9326 DemandedElts.
setBit(InsertIdx);
9327 IsIdentity &= InsertIdx - OffsetBeg ==
I;
9328 Mask[InsertIdx - OffsetBeg] =
I;
9330 assert(
Offset < NumElts &&
"Failed to find vector index offset");
9345 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
9346 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9354 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9355 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
9356 if (InsertVecSz != VecSz) {
9368 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
9377 case Instruction::ZExt:
9378 case Instruction::SExt:
9379 case Instruction::FPToUI:
9380 case Instruction::FPToSI:
9381 case Instruction::FPExt:
9382 case Instruction::PtrToInt:
9383 case Instruction::IntToPtr:
9384 case Instruction::SIToFP:
9385 case Instruction::UIToFP:
9386 case Instruction::Trunc:
9387 case Instruction::FPTrunc:
9388 case Instruction::BitCast: {
9389 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9392 unsigned Opcode = ShuffleOrOp;
9393 unsigned VecOpcode = Opcode;
9395 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
9397 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
9398 if (SrcIt != MinBWs.
end()) {
9399 SrcBWSz = SrcIt->second.first;
9403 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9404 if (BWSz == SrcBWSz) {
9405 VecOpcode = Instruction::BitCast;
9406 }
else if (BWSz < SrcBWSz) {
9407 VecOpcode = Instruction::Trunc;
9408 }
else if (It != MinBWs.
end()) {
9409 assert(BWSz > SrcBWSz &&
"Invalid cast!");
9410 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9411 }
else if (SrcIt != MinBWs.
end()) {
9412 assert(BWSz > SrcBWSz &&
"Invalid cast!");
9414 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9416 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
9417 !SrcIt->second.second) {
9418 VecOpcode = Instruction::UIToFP;
9421 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9429 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9431 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
9435 VecOpcode == Opcode ? VI :
nullptr);
9437 return GetCostDiff(GetScalarCost, GetVectorCost);
9439 case Instruction::FCmp:
9440 case Instruction::ICmp:
9441 case Instruction::Select: {
9445 match(VL0, MatchCmp))
9451 auto GetScalarCost = [&](
unsigned Idx) {
9452 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9458 !
match(VI, MatchCmp)) ||
9459 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9465 Builder.getInt1Ty(), CurrentPred,
CostKind,
9472 E->getOpcode(), VecTy, MaskTy, VecPred,
CostKind, VL0);
9484 if (IntrinsicAndUse.second)
9487 VecCost = std::min(VecCost, IntrinsicCost);
9489 return VecCost + CommonCost;
9491 return GetCostDiff(GetScalarCost, GetVectorCost);
9493 case Instruction::FNeg:
9494 case Instruction::Add:
9495 case Instruction::FAdd:
9496 case Instruction::Sub:
9497 case Instruction::FSub:
9498 case Instruction::Mul:
9499 case Instruction::FMul:
9500 case Instruction::UDiv:
9501 case Instruction::SDiv:
9502 case Instruction::FDiv:
9503 case Instruction::URem:
9504 case Instruction::SRem:
9505 case Instruction::FRem:
9506 case Instruction::Shl:
9507 case Instruction::LShr:
9508 case Instruction::AShr:
9509 case Instruction::And:
9510 case Instruction::Or:
9511 case Instruction::Xor: {
9512 auto GetScalarCost = [&](
unsigned Idx) {
9513 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9514 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9523 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
9524 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9527 auto *CI = dyn_cast<ConstantInt>(
Op);
9528 return CI && CI->getValue().countr_one() >= It->second.first;
9533 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9537 Op2Info, std::nullopt,
nullptr, TLI) +
9540 return GetCostDiff(GetScalarCost, GetVectorCost);
9542 case Instruction::GetElementPtr: {
9543 return CommonCost + GetGEPCostDiff(VL, VL0);
9545 case Instruction::Load: {
9546 auto GetScalarCost = [&](
unsigned Idx) {
9547 auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
9549 VI->getAlign(),
VI->getPointerAddressSpace(),
9552 auto *LI0 = cast<LoadInst>(VL0);
9555 if (E->State == TreeEntry::Vectorize) {
9557 Instruction::Load, VecTy, LI0->getAlign(),
9559 }
else if (E->State == TreeEntry::StridedVectorize) {
9560 Align CommonAlignment =
9561 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9563 Instruction::Load, VecTy, LI0->getPointerOperand(),
9566 assert(E->State == TreeEntry::ScatterVectorize &&
"Unknown EntryState");
9567 Align CommonAlignment =
9568 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9570 Instruction::Load, VecTy, LI0->getPointerOperand(),
9573 return VecLdCost + CommonCost;
9579 if (E->State == TreeEntry::ScatterVectorize)
9585 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
9586 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9588 case Instruction::Store: {
9589 bool IsReorder = !E->ReorderIndices.empty();
9590 auto GetScalarCost = [=](
unsigned Idx) {
9591 auto *
VI = cast<StoreInst>(VL[
Idx]);
9594 VI->getAlign(),
VI->getPointerAddressSpace(),
9598 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9602 if (E->State == TreeEntry::StridedVectorize) {
9603 Align CommonAlignment =
9604 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9606 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9609 assert(E->State == TreeEntry::Vectorize &&
9610 "Expected either strided or consecutive stores.");
9613 Instruction::Store, VecTy, BaseSI->getAlign(),
9614 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
9616 return VecStCost + CommonCost;
9620 unsigned Idx = IsReorder ? E->ReorderIndices[
I] :
I;
9621 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
9624 return GetCostDiff(GetScalarCost, GetVectorCost) +
9625 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9627 case Instruction::Call: {
9628 auto GetScalarCost = [&](
unsigned Idx) {
9629 auto *CI = cast<CallInst>(UniqueValues[
Idx]);
9640 auto *CI = cast<CallInst>(VL0);
9644 It != MinBWs.
end() ? It->second.first : 0);
9646 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9648 return GetCostDiff(GetScalarCost, GetVectorCost);
9650 case Instruction::ShuffleVector: {
9651 assert(E->isAltShuffle() &&
9656 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9657 "Invalid Shuffle Vector Operand");
9660 auto TryFindNodeWithEqualOperands = [=]() {
9661 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9664 if (
TE->isAltShuffle() &&
9665 ((
TE->getOpcode() == E->getOpcode() &&
9666 TE->getAltOpcode() == E->getAltOpcode()) ||
9667 (
TE->getOpcode() == E->getAltOpcode() &&
9668 TE->getAltOpcode() == E->getOpcode())) &&
9669 TE->hasEqualOperands(*E))
9674 auto GetScalarCost = [&](
unsigned Idx) {
9675 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9676 assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
9686 if (TryFindNodeWithEqualOperands()) {
9688 dbgs() <<
"SLP: diamond match for alternate node found.\n";
9695 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
9697 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
9698 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9700 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9701 CI0->getPredicate(),
CostKind, VL0);
9702 VecCost += TTIRef.getCmpSelInstrCost(
9703 E->getOpcode(), VecTy, MaskTy,
9704 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
9707 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9710 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9711 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9713 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9714 if (SrcIt != MinBWs.
end()) {
9715 SrcBWSz = SrcIt->second.first;
9719 if (BWSz <= SrcBWSz) {
9722 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9726 <<
"SLP: alternate extension, which should be truncated.\n";
9732 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9735 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9739 E->buildAltOpShuffleMask(
9741 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
9742 return I->getOpcode() == E->getAltOpcode();
9751 unsigned Opcode0 = E->getOpcode();
9752 unsigned Opcode1 = E->getAltOpcode();
9755 for (
unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9756 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9757 OpcodeMask.set(Lane);
9760 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9762 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
9763 return AltVecCost < VecCost ? AltVecCost : VecCost;
9768 return GetCostDiff(GetScalarCost, GetVectorCost);
9775bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
9777 << VectorizableTree.size() <<
" is fully vectorizable .\n");
9779 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
9781 return TE->State == TreeEntry::NeedToGather &&
9783 [
this](
Value *V) { return EphValues.contains(V); }) &&
9785 TE->Scalars.size() < Limit ||
9786 ((
TE->getOpcode() == Instruction::ExtractElement ||
9787 all_of(
TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9789 (
TE->State == TreeEntry::NeedToGather &&
9790 TE->getOpcode() == Instruction::Load && !
TE->isAltShuffle()));
9794 if (VectorizableTree.size() == 1 &&
9795 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9797 AreVectorizableGathers(VectorizableTree[0].
get(),
9798 VectorizableTree[0]->Scalars.size()) &&
9799 VectorizableTree[0]->getVectorFactor() > 2)))
9802 if (VectorizableTree.size() != 2)
9810 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9811 AreVectorizableGathers(VectorizableTree[1].
get(),
9812 VectorizableTree[0]->Scalars.size()))
9816 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9817 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9818 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9819 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9827 bool MustMatchOrInst) {
9831 Value *ZextLoad = Root;
9832 const APInt *ShAmtC;
9833 bool FoundOr =
false;
9834 while (!isa<ConstantExpr>(ZextLoad) &&
9837 ShAmtC->
urem(8) == 0))) {
9838 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9839 ZextLoad = BinOp->getOperand(0);
9840 if (BinOp->getOpcode() == Instruction::Or)
9845 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9852 Type *SrcTy = Load->getType();
9859 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
9860 << *(cast<Instruction>(Root)) <<
"\n");
9869 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9870 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9878 unsigned NumElts = Stores.
size();
9879 for (
Value *Scalar : Stores) {
9890 if (VectorizableTree.size() == 2 &&
9891 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9892 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9893 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9894 !(
isSplat(VectorizableTree[1]->Scalars) ||
9902 constexpr int Limit = 4;
9904 !VectorizableTree.empty() &&
9905 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9906 return (TE->State == TreeEntry::NeedToGather &&
9907 TE->getOpcode() != Instruction::ExtractElement &&
9908 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9909 TE->getOpcode() == Instruction::PHI;
9920 if (isFullyVectorizableTinyTree(ForReduction))
9925 bool IsAllowedSingleBVNode =
9926 VectorizableTree.size() > 1 ||
9927 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9928 !VectorizableTree.front()->isAltShuffle() &&
9929 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9930 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9932 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9933 return TE->State == TreeEntry::NeedToGather &&
9935 return isa<ExtractElementInst, UndefValue>(V) ||
9936 (IsAllowedSingleBVNode &&
9937 !V->hasNUsesOrMore(UsesLimit) &&
9938 any_of(V->users(), IsaPred<InsertElementInst>));
9943 assert(VectorizableTree.empty()
9944 ? ExternalUses.empty()
9945 :
true &&
"We shouldn't have any external users");
9957 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9970 for (
const auto &TEPtr : VectorizableTree) {
9971 if (TEPtr->State != TreeEntry::Vectorize)
9973 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9979 auto *NodeA = DT->
getNode(
A->getParent());
9980 auto *NodeB = DT->
getNode(
B->getParent());
9981 assert(NodeA &&
"Should only process reachable instructions");
9982 assert(NodeB &&
"Should only process reachable instructions");
9983 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9984 "Different nodes should have different DFS numbers");
9986 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9987 return B->comesBefore(
A);
9997 LiveValues.
erase(PrevInst);
9998 for (
auto &J : PrevInst->
operands()) {
9999 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10000 LiveValues.
insert(cast<Instruction>(&*J));
10004 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
10005 for (
auto *
X : LiveValues)
10006 dbgs() <<
" " <<
X->getName();
10007 dbgs() <<
", Looking at ";
10012 unsigned NumCalls = 0;
10016 while (InstIt != PrevInstIt) {
10018 PrevInstIt = Inst->getParent()->rbegin();
10023 if (
auto *II = dyn_cast<IntrinsicInst>(
I)) {
10024 if (II->isAssumeLikeIntrinsic())
10028 for (
auto &ArgOp : II->args())
10030 if (
auto *FPMO = dyn_cast<FPMathOperator>(II))
10031 FMF = FPMO->getFastMathFlags();
10038 if (IntrCost < CallCost)
10045 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10046 &*PrevInstIt != PrevInst)
10054 for (
auto *II : LiveValues) {
10055 auto *ScalarTy = II->getType();
10056 if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10057 ScalarTy = VectorTy->getElementType();
10075 const auto *I1 = IE1;
10076 const auto *I2 = IE2;
10088 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10090 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10091 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
10093 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10094 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10101struct ValueSelect {
10102 template <
typename U>
10103 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
10106 template <
typename U>
10107 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
10125template <
typename T>
10131 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
10133 auto VMIt = std::next(ShuffleMask.begin());
10136 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10138 if (!IsBaseUndef.
all()) {
10140 std::pair<T *, bool> Res =
10141 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
10143 for (
unsigned Idx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
10147 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
10149 auto *V = ValueSelect::get<T *>(
Base);
10151 assert((!V || GetVF(V) == Mask.size()) &&
10152 "Expected base vector of VF number of elements.");
10153 Prev = Action(Mask, {
nullptr, Res.first});
10154 }
else if (ShuffleMask.size() == 1) {
10157 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10163 Prev = Action(Mask, {ShuffleMask.begin()->first});
10167 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10168 unsigned Vec2VF = GetVF(VMIt->first);
10169 if (Vec1VF == Vec2VF) {
10173 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10176 Mask[
I] = SecMask[
I] + Vec1VF;
10179 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10182 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10184 std::pair<T *, bool> Res2 =
10185 ResizeAction(VMIt->first, VMIt->second,
false);
10187 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10194 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
10197 Prev = Action(Mask, {Res1.first, Res2.first});
10199 VMIt = std::next(VMIt);
10201 bool IsBaseNotUndef = !IsBaseUndef.
all();
10202 (void)IsBaseNotUndef;
10204 for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10206 std::pair<T *, bool> Res =
10207 ResizeAction(VMIt->first, VMIt->second,
false);
10209 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10212 "Multiple uses of scalars.");
10213 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
10218 Prev = Action(Mask, {Prev, Res.first});
10226 << VectorizableTree.size() <<
".\n");
10228 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10231 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
10232 TreeEntry &TE = *VectorizableTree[
I];
10233 if (TE.State == TreeEntry::NeedToGather) {
10234 if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
10235 E && E->getVectorFactor() == TE.getVectorFactor() &&
10236 E->isSame(TE.Scalars)) {
10241 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10250 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10260 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10261 for (ExternalUser &EU : ExternalUses) {
10263 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10264 !ExtractCostCalculated.
insert(EU.Scalar).second)
10270 if (EphValues.
count(EU.User))
10274 if (isa<FixedVectorType>(EU.Scalar->getType()))
10279 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
10280 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
10281 if (!UsedInserts.
insert(VU).second)
10285 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10288 [
this, VU](
const std::pair<Value *, const TreeEntry *> &Pair) {
10290 VU, cast<InsertElementInst>(Pair.first),
10292 Value *Op0 = II->getOperand(0);
10293 if (getTreeEntry(II) && !getTreeEntry(Op0))
10299 if (It == FirstUsers.
end()) {
10306 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
10307 if (IEBase != EU.User &&
10308 (!IEBase->hasOneUse() ||
10312 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
10315 IEBase = cast<InsertElementInst>(
Base);
10318 "InsertElementInstruction used already.");
10320 Base = IEBase->getOperand(0);
10321 }
while (E == getTreeEntry(
Base));
10324 Base = cast<InsertElementInst>(
Base)->getOperand(0);
10328 VecId = FirstUsers.
size() - 1;
10329 auto It = MinBWs.
find(ScalarTE);
10330 if (It != MinBWs.
end() &&
10332 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
10334 unsigned BWSz = It->second.first;
10335 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
10336 unsigned VecOpcode;
10337 if (DstBWSz < BWSz)
10338 VecOpcode = Instruction::Trunc;
10341 It->second.second ? Instruction::SExt : Instruction::ZExt;
10347 FTy->getNumElements()),
10350 <<
" for extending externally used vector with "
10351 "non-equal minimum bitwidth.\n");
10357 VecId = std::distance(FirstUsers.
begin(), It);
10359 int InIdx = *InsertIdx;
10363 Mask[InIdx] = EU.Lane;
10364 DemandedElts[VecId].setBit(InIdx);
10372 if (
auto *
GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10373 if (!ValueToExtUses) {
10374 ValueToExtUses.emplace();
10376 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
10382 if (!getTreeEntry(V))
10384 auto It = ValueToExtUses->find(V);
10385 if (It != ValueToExtUses->end()) {
10387 ExternalUses[It->second].User = nullptr;
10392 if (CanBeUsedAsGEP) {
10394 ExternalUsesAsGEPs.
insert(EU.Scalar);
10403 auto It = MinBWs.
find(getTreeEntry(EU.Scalar));
10404 if (It != MinBWs.
end()) {
10407 It->second.second ? Instruction::SExt : Instruction::ZExt;
10417 if (!VectorizedVals.
empty()) {
10418 const TreeEntry &Root = *VectorizableTree.front().get();
10419 auto BWIt = MinBWs.find(&Root);
10420 if (BWIt != MinBWs.end()) {
10421 Type *DstTy = Root.Scalars.front()->getType();
10422 unsigned OriginalSz =
DL->getTypeSizeInBits(DstTy);
10424 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10425 if (OriginalSz != SrcSz) {
10426 unsigned Opcode = Instruction::Trunc;
10427 if (OriginalSz > SrcSz)
10428 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10438 Cost += SpillCost + ExtractCost;
10442 unsigned VF =
Mask.size();
10443 unsigned VecVF =
TE->getVectorFactor();
10445 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
10448 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
10454 dbgs() <<
"SLP: Adding cost " <<
C
10455 <<
" for final shuffle of insertelement external users.\n";
10456 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10458 return std::make_pair(TE,
true);
10460 return std::make_pair(TE,
false);
10463 for (
int I = 0, E = FirstUsers.size();
I < E; ++
I) {
10464 Value *
Base = cast<Instruction>(FirstUsers[
I].first)->getOperand(0);
10465 auto Vector = ShuffleMasks[
I].takeVector();
10469 assert((TEs.size() == 1 || TEs.size() == 2) &&
10470 "Expected exactly 1 or 2 tree entries.");
10471 if (TEs.size() == 1) {
10473 VF = TEs.front()->getVectorFactor();
10479 (
Data.index() < VF &&
10480 static_cast<int>(
Data.index()) ==
Data.value());
10485 <<
" for final shuffle of insertelement "
10486 "external users.\n";
10487 TEs.front()->
dump();
10488 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10494 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10495 VF = TEs.front()->getVectorFactor();
10504 <<
" for final shuffle of vector node and external "
10505 "insertelement users.\n";
10506 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10507 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10513 (void)performExtractsShuffleAction<const TreeEntry>(
10515 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
10516 EstimateShufflesCost);
10518 cast<FixedVectorType>(FirstUsers[
I].first->getType()), DemandedElts[
I],
10520 Cost -= InsertCost;
10524 if (ReductionBitWidth != 0) {
10525 assert(UserIgnoreList &&
"Expected reduction tree.");
10526 const TreeEntry &E = *VectorizableTree.front().get();
10527 auto It = MinBWs.find(&E);
10528 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10529 unsigned SrcSize = It->second.first;
10530 unsigned DstSize = ReductionBitWidth;
10531 unsigned Opcode = Instruction::Trunc;
10532 if (SrcSize < DstSize)
10533 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10540 switch (E.getOpcode()) {
10541 case Instruction::SExt:
10542 case Instruction::ZExt:
10543 case Instruction::Trunc: {
10544 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10545 CCH = getCastContextHint(*OpTE);
10555 <<
" for final resize for reduction from " << SrcVecTy
10556 <<
" to " << DstVecTy <<
"\n";
10557 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10565 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
10566 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
10567 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
10571 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
10582std::optional<TTI::ShuffleKind>
10583BoUpSLP::tryToGatherSingleRegisterExtractElements(
10589 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
10590 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10592 if (isa<UndefValue>(VL[
I]))
10596 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10597 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10606 ExtractMask.reset(*
Idx);
10611 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
10615 for (
const auto &
Data : VectorOpToIdx)
10616 VFToVector[cast<FixedVectorType>(
Data.first->getType())->getNumElements()]
10617 .push_back(
Data.first);
10618 for (
auto &
Data : VFToVector) {
10620 return VectorOpToIdx.find(V1)->second.size() >
10621 VectorOpToIdx.find(V2)->second.size();
10626 const int UndefSz = UndefVectorExtracts.
size();
10627 unsigned SingleMax = 0;
10628 Value *SingleVec =
nullptr;
10629 unsigned PairMax = 0;
10630 std::pair<Value *, Value *> PairVec(
nullptr,
nullptr);
10631 for (
auto &
Data : VFToVector) {
10633 if (SingleMax < VectorOpToIdx[V1].
size() + UndefSz) {
10634 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10638 if (
Data.second.size() > 1)
10639 V2 = *std::next(
Data.second.begin());
10640 if (V2 && PairMax < VectorOpToIdx[V1].
size() + VectorOpToIdx[V2].
size() +
10642 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[
V2].size() + UndefSz;
10643 PairVec = std::make_pair(V1, V2);
10646 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10647 return std::nullopt;
10653 if (SingleMax >= PairMax && SingleMax) {
10654 for (
int Idx : VectorOpToIdx[SingleVec])
10657 for (
Value *V : {PairVec.first, PairVec.second})
10658 for (
int Idx : VectorOpToIdx[V])
10662 for (
int Idx : UndefVectorExtracts)
10666 std::optional<TTI::ShuffleKind> Res =
10672 return std::nullopt;
10676 for (
int I = 0, E = GatheredExtracts.size();
I < E; ++
I) {
10677 if (Mask[
I] ==
PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[
I]) &&
10678 isa<UndefValue>(GatheredExtracts[
I])) {
10682 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10683 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10684 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10699 unsigned NumParts)
const {
10700 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
10703 unsigned SliceSize = VL.
size() / NumParts;
10704 for (
unsigned Part = 0; Part < NumParts; ++Part) {
10710 std::optional<TTI::ShuffleKind> Res =
10711 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10712 ShufflesRes[Part] = Res;
10713 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
10715 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
10716 return Res.has_value();
10718 ShufflesRes.clear();
10719 return ShufflesRes;
10722std::optional<TargetTransformInfo::ShuffleKind>
10723BoUpSLP::isGatherShuffledSingleRegisterEntry(
10729 const EdgeInfo &TEUseEI =
TE->UserTreeIndices.front();
10730 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10734 if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10735 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10738 TEInsertBlock = TEInsertPt->
getParent();
10741 return std::nullopt;
10742 auto *NodeUI = DT->
getNode(TEInsertBlock);
10743 assert(NodeUI &&
"Should only process reachable instructions");
10745 auto CheckOrdering = [&](
const Instruction *InsertPt) {
10759 auto *NodeEUI = DT->
getNode(InsertBlock);
10762 assert((NodeUI == NodeEUI) ==
10763 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10764 "Different nodes should have different DFS numbers");
10766 if (TEInsertPt->
getParent() != InsertBlock &&
10769 if (TEInsertPt->
getParent() == InsertBlock &&
10783 for (
Value *V : VL) {
10788 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10792 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
10793 "Must contain at least single gathered value.");
10794 assert(TEPtr->UserTreeIndices.size() == 1 &&
10795 "Expected only single user of a gather node.");
10796 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10798 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10801 : &getLastInstructionInBundle(UseEI.UserTE);
10802 if (TEInsertPt == InsertPt) {
10806 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10810 if (TEUseEI.UserTE != UseEI.UserTE &&
10811 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10817 if ((TEInsertBlock != InsertPt->
getParent() ||
10818 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10819 !CheckOrdering(InsertPt))
10823 if (
const TreeEntry *VTE = getTreeEntry(V)) {
10825 if (VTE->State != TreeEntry::Vectorize) {
10826 auto It = MultiNodeScalars.
find(V);
10827 if (It == MultiNodeScalars.
end())
10829 VTE = *It->getSecond().begin();
10831 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
10832 return MTE->State == TreeEntry::Vectorize;
10834 if (MIt == It->getSecond().end())
10839 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10840 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10844 if (VToTEs.
empty())
10846 if (UsedTEs.
empty()) {
10860 if (!VToTEs.
empty()) {
10866 VToTEs = SavedVToTEs;
10875 if (UsedTEs.
size() == 2)
10877 UsedTEs.push_back(SavedVToTEs);
10884 if (UsedTEs.
empty()) {
10886 return std::nullopt;
10890 if (UsedTEs.
size() == 1) {
10893 UsedTEs.front().
end());
10894 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10895 return TE1->Idx < TE2->Idx;
10898 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
10899 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
10901 if (It != FirstEntries.end() &&
10902 ((*It)->getVectorFactor() == VL.size() ||
10903 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
10904 TE->ReuseShuffleIndices.size() == VL.size() &&
10905 (*It)->isSame(
TE->Scalars)))) {
10906 Entries.push_back(*It);
10907 if ((*It)->getVectorFactor() == VL.size()) {
10908 std::iota(std::next(
Mask.begin(), Part * VL.size()),
10909 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
10915 for (
int I = 0, Sz = VL.size();
I < Sz; ++
I)
10916 if (isa<PoisonValue>(VL[
I]))
10922 Entries.push_back(FirstEntries.front());
10925 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
10928 for (
const TreeEntry *TE : UsedTEs.front()) {
10929 unsigned VF =
TE->getVectorFactor();
10930 auto It = VFToTE.
find(VF);
10931 if (It != VFToTE.
end()) {
10932 if (It->second->Idx >
TE->Idx)
10933 It->getSecond() =
TE;
10940 UsedTEs.back().
end());
10941 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10942 return TE1->Idx < TE2->Idx;
10944 for (
const TreeEntry *TE : SecondEntries) {
10945 auto It = VFToTE.
find(
TE->getVectorFactor());
10946 if (It != VFToTE.
end()) {
10948 Entries.push_back(It->second);
10949 Entries.push_back(TE);
10955 if (Entries.empty()) {
10957 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10958 return TE1->Idx < TE2->Idx;
10960 Entries.push_back(SecondEntries.front());
10961 VF = std::max(Entries.front()->getVectorFactor(),
10962 Entries.back()->getVectorFactor());
10966 bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
10969 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
10970 auto *
PHI = cast<PHINode>(V);
10971 auto *PHI1 = cast<PHINode>(V1);
10976 for (
int I = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
10978 Value *In1 = PHI1->getIncomingValue(
I);
10983 if (cast<Instruction>(In)->
getParent() !=
10993 auto MightBeIgnored = [=](
Value *
V) {
10994 auto *
I = dyn_cast<Instruction>(V);
10995 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
10997 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
11002 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
11004 bool UsedInSameVTE =
false;
11005 auto It = UsedValuesEntry.
find(V1);
11006 if (It != UsedValuesEntry.
end())
11007 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
11008 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11010 cast<Instruction>(V)->getParent() ==
11011 cast<Instruction>(V1)->getParent() &&
11012 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11017 for (
int I = 0, E = VL.size();
I < E; ++
I) {
11019 auto It = UsedValuesEntry.
find(V);
11020 if (It == UsedValuesEntry.
end())
11026 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
11027 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
11029 unsigned Idx = It->second;
11036 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
11037 if (!UsedIdxs.test(
I))
11043 for (std::pair<unsigned, int> &Pair : EntryLanes)
11044 if (Pair.first ==
I)
11045 Pair.first = TempEntries.
size();
11048 Entries.swap(TempEntries);
11049 if (EntryLanes.size() == Entries.size() &&
11051 .
slice(Part * VL.size(),
11052 std::min<int>(VL.size(),
TE->Scalars.size())))) {
11058 return std::nullopt;
11061 bool IsIdentity = Entries.size() == 1;
11064 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
11065 unsigned Idx = Part * VL.size() + Pair.second;
11068 (ForOrder ? std::distance(
11069 Entries[Pair.first]->Scalars.begin(),
11070 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11071 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11072 IsIdentity &=
Mask[
Idx] == Pair.second;
11074 switch (Entries.size()) {
11076 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11080 if (EntryLanes.size() > 2 || VL.size() <= 2)
11088 std::fill(std::next(
Mask.begin(), Part * VL.size()),
11090 return std::nullopt;
11094BoUpSLP::isGatherShuffledEntry(
11098 assert(NumParts > 0 && NumParts < VL.
size() &&
11099 "Expected positive number of registers.");
11102 if (TE == VectorizableTree.front().get())
11105 if (
TE->isNonPowOf2Vec())
11108 assert(
TE->UserTreeIndices.size() == 1 &&
11109 "Expected only single user of the gather node.");
11111 "Number of scalars must be divisible by NumParts.");
11112 unsigned SliceSize = VL.
size() / NumParts;
11114 for (
unsigned Part = 0; Part < NumParts; ++Part) {
11117 std::optional<TTI::ShuffleKind> SubRes =
11118 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11121 SubEntries.
clear();
11124 SubEntries.
front()->getVectorFactor() == VL.
size() &&
11125 (SubEntries.
front()->isSame(
TE->Scalars) ||
11126 SubEntries.
front()->isSame(VL))) {
11128 LocalSubEntries.
swap(SubEntries);
11131 std::iota(
Mask.begin(),
Mask.end(), 0);
11133 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
11134 if (isa<PoisonValue>(VL[
I]))
11136 Entries.emplace_back(1, LocalSubEntries.
front());
11142 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
11150 Type *ScalarTy)
const {
11152 bool DuplicateNonConst =
false;
11160 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
11161 if (
V->getType() != ScalarTy) {
11172 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
11175 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V)) {
11183 EstimateInsertCost(
I, V);
11184 ShuffleMask[
I] =
I;
11188 DuplicateNonConst =
true;
11190 ShuffleMask[
I] = Res.first->second;
11196 if (DuplicateNonConst)
11198 VecTy, ShuffleMask);
11210 VLOperands Ops(VL, R);
11213 Left = Ops.getVL(0);
11214 Right = Ops.getVL(1);
11217Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *E) {
11220 return *Res.second;
11224 auto *Front = E->getMainOp();
11227 if (E->getOpcode() == Instruction::GetElementPtr &&
11228 !isa<GetElementPtrInst>(V))
11230 auto *I = cast<Instruction>(V);
11231 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11232 isVectorLikeInstWithConstOps(I);
11235 auto FindLastInst = [&]() {
11237 for (
Value *V : E->Scalars) {
11238 auto *
I = dyn_cast<Instruction>(V);
11241 if (LastInst->
getParent() ==
I->getParent()) {
11246 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11247 !isa<GetElementPtrInst>(
I)) ||
11250 "Expected vector-like or non-GEP in GEP node insts only.");
11258 auto *NodeB = DT->
getNode(
I->getParent());
11259 assert(NodeA &&
"Should only process reachable instructions");
11260 assert(NodeB &&
"Should only process reachable instructions");
11261 assert((NodeA == NodeB) ==
11262 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11263 "Different nodes should have different DFS numbers");
11264 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11271 auto FindFirstInst = [&]() {
11273 for (
Value *V : E->Scalars) {
11274 auto *
I = dyn_cast<Instruction>(V);
11277 if (FirstInst->
getParent() ==
I->getParent()) {
11278 if (
I->comesBefore(FirstInst))
11282 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11283 !isa<GetElementPtrInst>(
I)) ||
11286 "Expected vector-like or non-GEP in GEP node insts only.");
11294 auto *NodeB = DT->
getNode(
I->getParent());
11295 assert(NodeA &&
"Should only process reachable instructions");
11296 assert(NodeB &&
"Should only process reachable instructions");
11297 assert((NodeA == NodeB) ==
11298 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11299 "Different nodes should have different DFS numbers");
11300 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11309 (E->State != TreeEntry::NeedToGather &&
11311 if ((E->getOpcode() == Instruction::GetElementPtr &&
11314 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11318 return !isVectorLikeInstWithConstOps(V) &&
11319 isUsedOutsideBlock(V);
11321 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11323 return isa<ExtractElementInst, UndefValue>(V) ||
11324 areAllOperandsNonInsts(V);
11326 Res.second = FindLastInst();
11328 Res.second = FindFirstInst();
11329 return *Res.second;
11336 if (BlocksSchedules.count(BB)) {
11337 Value *
V = E->isOneOf(E->Scalars.back());
11340 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11341 if (Bundle && Bundle->isPartOfBundle())
11342 for (; Bundle; Bundle = Bundle->NextInBundle)
11343 if (Bundle->OpValue == Bundle->Inst)
11344 Res.second = Bundle->Inst;
11366 Res.second = FindLastInst();
11367 assert(Res.second &&
"Failed to find last instruction in bundle");
11368 return *Res.second;
11371void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *E) {
11372 auto *Front = E->getMainOp();
11373 Instruction *LastInst = &getLastInstructionInBundle(E);
11374 assert(LastInst &&
"Failed to find last instruction in bundle");
11377 bool IsPHI = isa<PHINode>(LastInst);
11380 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11382 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
11386 Builder.SetInsertPoint(
11390 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11400 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
11403 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
11404 InsertBB = InsertBB->getSinglePredecessor();
11405 return InsertBB && InsertBB == InstBB;
11407 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11408 if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
11409 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11410 getTreeEntry(Inst) ||
11411 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
11412 PostponedIndices.
insert(
I).second)
11416 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
11419 if (
Scalar->getType() != Ty) {
11421 "Expected integer types only.");
11422 Scalar = Builder.CreateIntCast(
11426 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11427 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11430 GatherShuffleExtractSeq.
insert(InsElt);
11431 CSEBlocks.
insert(InsElt->getParent());
11433 if (isa<Instruction>(V)) {
11434 if (TreeEntry *Entry = getTreeEntry(V)) {
11436 User *UserOp =
nullptr;
11438 if (
auto *SI = dyn_cast<Instruction>(Scalar))
11444 unsigned FoundLane = Entry->findLaneForValue(V);
11445 ExternalUses.emplace_back(V, UserOp, FoundLane);
11455 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11463 if (!isa<UndefValue>(VL[
I])) {
11467 if (isa<PoisonValue>(VL[
I]))
11469 if (
auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11474 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
11477 for (
int I : NonConsts)
11478 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
11481 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11482 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11520 bool IsFinalized =
false;
11530 Type *ScalarTy =
nullptr;
11534 class ShuffleIRBuilder {
11547 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11548 CSEBlocks(CSEBlocks),
DL(
DL) {}
11549 ~ShuffleIRBuilder() =
default;
11552 if (V1->
getType() != V2->getType()) {
11555 "Expected integer vector types only.");
11556 if (V1->
getType() != V2->getType()) {
11557 if (cast<VectorType>(V2->getType())
11559 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
11561 ->getIntegerBitWidth())
11570 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
11571 GatherShuffleExtractSeq.
insert(
I);
11572 CSEBlocks.
insert(
I->getParent());
11581 unsigned VF = Mask.size();
11582 unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
11586 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
11587 GatherShuffleExtractSeq.
insert(
I);
11588 CSEBlocks.
insert(
I->getParent());
11592 Value *createIdentity(
Value *V) {
return V; }
11593 Value *createPoison(
Type *Ty,
unsigned VF) {
11598 void resizeToMatch(
Value *&V1,
Value *&V2) {
11599 if (V1->
getType() == V2->getType())
11601 int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
11602 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11603 int VF = std::max(V1VF, V2VF);
11604 int MinVF = std::min(V1VF, V2VF);
11606 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
11608 Value *&
Op = MinVF == V1VF ? V1 : V2;
11610 if (
auto *
I = dyn_cast<Instruction>(
Op)) {
11611 GatherShuffleExtractSeq.
insert(
I);
11612 CSEBlocks.
insert(
I->getParent());
11625 assert(V1 &&
"Expected at least one vector value.");
11626 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11627 R.CSEBlocks, *R.DL);
11628 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11636 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11644 std::optional<bool> IsSigned = std::nullopt) {
11645 auto *VecTy = cast<VectorType>(V->getType());
11655 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11659 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11660 unsigned NumParts,
bool &UseVecBaseAsInput) {
11661 UseVecBaseAsInput =
false;
11663 Value *VecBase =
nullptr;
11664 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
11668 auto *EI = cast<ExtractElementInst>(E->Scalars[
I]);
11669 VecBase = EI->getVectorOperand();
11670 if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
11671 VecBase = TE->VectorizedValue;
11672 assert(VecBase &&
"Expected vectorized value.");
11673 UniqueBases.
insert(VecBase);
11676 if (!EI->hasOneUse() || (NumParts != 1 &&
count(E->Scalars, EI) > 1) ||
11678 const TreeEntry *UTE = R.getTreeEntry(U);
11679 return !UTE || R.MultiNodeScalars.contains(U) ||
11680 (isa<GetElementPtrInst>(U) &&
11681 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11682 count_if(R.VectorizableTree,
11683 [&](const std::unique_ptr<TreeEntry> &TE) {
11684 return any_of(TE->UserTreeIndices,
11685 [&](const EdgeInfo &Edge) {
11686 return Edge.UserTE == UTE;
11688 is_contained(TE->Scalars, EI);
11692 R.eraseInstruction(EI);
11694 if (NumParts == 1 || UniqueBases.
size() == 1) {
11695 VecBase = castToScalarTyElem(VecBase);
11698 UseVecBaseAsInput =
true;
11708 Value *Vec =
nullptr;
11710 unsigned SliceSize = E->Scalars.size() / NumParts;
11711 for (
unsigned Part = 0; Part < NumParts; ++Part) {
11715 constexpr int MaxBases = 2;
11723 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11724 if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
11725 VecOp = TE->VectorizedValue;
11726 assert(VecOp &&
"Expected vectorized value.");
11728 cast<FixedVectorType>(VecOp->
getType())->getNumElements();
11730 assert((PrevSize ==
Size || PrevSize == 0) &&
11731 "Expected vectors of the same size.");
11734 VecOp = castToScalarTyElem(VecOp);
11735 Bases[SubMask[
I] <
Size ? 0 : 1] = VecOp;
11737 if (!Bases.front())
11740 if (Bases.back()) {
11741 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11742 TransformToIdentity(SubMask);
11744 SubVec = Bases.front();
11751 Mask.slice(
P * SliceSize, SliceSize);
11756 "Expected first part or all previous parts masked.");
11757 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11759 unsigned VF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11761 unsigned SubVecVF =
11762 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
11763 VF = std::max(VF, SubVecVF);
11766 for (
int &
Idx : SubMask)
11769 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11770 Vec = createShuffle(Vec, SubVec, VecMask);
11771 TransformToIdentity(VecMask);
11779 std::optional<Value *>
11785 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
11787 return std::nullopt;
11799 Value *V1 = E1.VectorizedValue;
11801 V1 = castToScalarTyElem(V1,
all_of(E1.Scalars, [&](
Value *V) {
11802 return !isKnownNonNegative(
11803 V, SimplifyQuery(*R.DL));
11805 Value *V2 = E2.VectorizedValue;
11806 if (V2->getType()->isIntOrIntVectorTy())
11807 V2 = castToScalarTyElem(V2,
all_of(E2.Scalars, [&](
Value *V) {
11808 return !isKnownNonNegative(
11809 V, SimplifyQuery(*R.DL));
11816 Value *V1 = E1.VectorizedValue;
11818 V1 = castToScalarTyElem(V1,
all_of(E1.Scalars, [&](
Value *V) {
11819 return !isKnownNonNegative(
11820 V, SimplifyQuery(*R.DL));
11826 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
11827 V1 = castToScalarTyElem(V1);
11828 V2 = castToScalarTyElem(V2);
11829 if (InVectors.
empty()) {
11832 CommonMask.
assign(Mask.begin(), Mask.end());
11836 if (InVectors.
size() == 2) {
11837 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11838 transformMaskAfterShuffle(CommonMask, CommonMask);
11839 }
else if (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
11841 Vec = createShuffle(Vec,
nullptr, CommonMask);
11842 transformMaskAfterShuffle(CommonMask, CommonMask);
11844 V1 = createShuffle(V1, V2, Mask);
11845 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11847 CommonMask[
Idx] =
Idx + Sz;
11848 InVectors.
front() = Vec;
11849 if (InVectors.
size() == 2)
11850 InVectors.
back() = V1;
11856 V1 = castToScalarTyElem(V1);
11857 if (InVectors.
empty()) {
11858 if (!isa<FixedVectorType>(V1->
getType())) {
11859 V1 = createShuffle(V1,
nullptr, CommonMask);
11861 transformMaskAfterShuffle(CommonMask, Mask);
11864 CommonMask.
assign(Mask.begin(), Mask.end());
11867 const auto *It =
find(InVectors, V1);
11868 if (It == InVectors.
end()) {
11869 if (InVectors.
size() == 2 ||
11871 !isa<FixedVectorType>(V1->
getType())) {
11873 if (InVectors.
size() == 2) {
11874 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11875 transformMaskAfterShuffle(CommonMask, CommonMask);
11876 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11877 CommonMask.
size()) {
11878 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
11879 transformMaskAfterShuffle(CommonMask, CommonMask);
11881 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11884 V->getType() != V1->
getType()
11886 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
11887 ->getNumElements();
11888 if (V->getType() != V1->
getType())
11889 V1 = createShuffle(V1,
nullptr, Mask);
11890 InVectors.
front() = V;
11891 if (InVectors.
size() == 2)
11892 InVectors.
back() = V1;
11899 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11905 int VF = CommonMask.
size();
11906 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
11907 VF = FTy->getNumElements();
11908 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11910 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
11919 Value *Root =
nullptr) {
11920 return R.gather(VL, Root, ScalarTy);
11929 IsFinalized =
true;
11932 if (InVectors.
size() == 2) {
11933 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11936 Vec = createShuffle(Vec,
nullptr, CommonMask);
11938 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11942 "Expected vector length for the final value before action.");
11943 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11946 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11947 Vec = createShuffle(Vec,
nullptr, ResizeMask);
11949 Action(Vec, CommonMask);
11950 InVectors.
front() = Vec;
11952 if (!ExtMask.
empty()) {
11953 if (CommonMask.
empty()) {
11957 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11960 NewMask[
I] = CommonMask[ExtMask[
I]];
11962 CommonMask.
swap(NewMask);
11965 if (CommonMask.
empty()) {
11966 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11967 return InVectors.
front();
11969 if (InVectors.
size() == 2)
11970 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11971 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
11976 "Shuffle construction must be finalized.");
11980Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
11981 bool PostponedPHIs) {
11982 ValueList &VL = E->getOperand(NodeIdx);
11983 const unsigned VF = VL.size();
11986 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11987 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
11988 if (It != VL.end())
11991 if (S.getOpcode()) {
11992 auto CheckSameVE = [&](
const TreeEntry *VE) {
11993 return VE->isSame(VL) &&
11994 (
any_of(VE->UserTreeIndices,
11995 [E, NodeIdx](
const EdgeInfo &EI) {
11996 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11998 any_of(VectorizableTree,
11999 [E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
12000 return TE->isOperandGatherNode({E, NodeIdx}) &&
12001 VE->isSame(TE->Scalars);
12004 TreeEntry *VE = getTreeEntry(S.OpValue);
12005 bool IsSameVE = VE && CheckSameVE(VE);
12007 auto It = MultiNodeScalars.
find(S.OpValue);
12008 if (It != MultiNodeScalars.
end()) {
12009 auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
12010 return TE != VE && CheckSameVE(TE);
12012 if (
I != It->getSecond().end()) {
12020 ShuffleInstructionBuilder ShuffleBuilder(
12021 cast<VectorType>(
V->getType())->getElementType(), Builder, *
this);
12022 ShuffleBuilder.add(V, Mask);
12023 return ShuffleBuilder.finalize(std::nullopt);
12026 if (VF != cast<FixedVectorType>(
V->getType())->getNumElements()) {
12027 if (!VE->ReuseShuffleIndices.empty()) {
12048 if (isa<PoisonValue>(V))
12050 Mask[
I] = VE->findLaneForValue(V);
12052 V = FinalShuffle(V, Mask);
12054 assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
12055 "Expected vectorization factor less "
12056 "than original vector size.");
12058 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12059 V = FinalShuffle(V, UniformMask);
12065 if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
12066 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12067 }) == VE->UserTreeIndices.end()) {
12069 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12070 return TE->State == TreeEntry::NeedToGather &&
12071 TE->UserTreeIndices.front().UserTE == E &&
12072 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12074 assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
12075 (*It)->VectorizedValue =
V;
12084 auto *
I =
find_if(VectorizableTree,
12085 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
12086 return TE->isOperandGatherNode({E, NodeIdx});
12088 assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
12089 assert(
I->get()->UserTreeIndices.size() == 1 &&
12090 "Expected only single user for the gather node.");
12091 assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
12095template <
typename BVTy,
typename ResTy,
typename...
Args>
12096ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
12098 assert(E->State == TreeEntry::NeedToGather &&
"Expected gather node.");
12099 unsigned VF = E->getVectorFactor();
12101 bool NeedFreeze =
false;
12103 E->ReuseShuffleIndices.end());
12109 if (!ReorderMask.
empty())
12112 unsigned I,
unsigned SliceSize) {
12114 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12117 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12118 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12119 if (UserTE->getNumOperands() != 2)
12122 find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
12123 return find_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
12124 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12125 }) !=
TE->UserTreeIndices.end();
12127 if (It == VectorizableTree.end())
12130 if ((
Mask.size() < InputVF &&
12133 (
Mask.size() == InputVF &&
12135 std::iota(std::next(
Mask.begin(),
I * SliceSize),
12136 std::next(
Mask.begin(), (
I + 1) * SliceSize), 0);
12140 std::fill(std::next(
Mask.begin(),
I * SliceSize),
12141 std::next(
Mask.begin(), (
I + 1) * SliceSize), IVal);
12145 BVTy ShuffleBuilder(ScalarTy, Params...);
12146 ResTy Res = ResTy();
12150 Value *ExtractVecBase =
nullptr;
12151 bool UseVecBaseAsInput =
false;
12154 Type *OrigScalarTy = GatheredScalars.front()->getType();
12157 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12159 if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
12161 bool Resized =
false;
12163 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12164 if (!ExtractShuffles.
empty()) {
12169 if (
const auto *TE = getTreeEntry(
12170 cast<ExtractElementInst>(E->Scalars[
Idx])->getVectorOperand()))
12173 if (std::optional<ResTy> Delayed =
12174 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12176 PostponedGathers.
insert(E);
12181 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
12182 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12183 ExtractVecBase = VecBase;
12184 if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12185 if (VF == VecBaseTy->getNumElements() &&
12186 GatheredScalars.size() != VF) {
12188 GatheredScalars.append(VF - GatheredScalars.size(),
12194 if (!ExtractShuffles.
empty() || E->getOpcode() != Instruction::Load ||
12195 E->isAltShuffle() ||
12196 all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
12198 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12200 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12202 if (!GatherShuffles.
empty()) {
12203 if (std::optional<ResTy> Delayed =
12204 ShuffleBuilder.needToDelay(E, Entries)) {
12206 PostponedGathers.
insert(E);
12211 if (GatherShuffles.
size() == 1 &&
12213 Entries.front().front()->isSame(E->Scalars)) {
12218 <<
"SLP: perfect diamond match for gather bundle "
12221 Mask.resize(E->Scalars.size());
12222 const TreeEntry *FrontTE = Entries.front().front();
12223 if (FrontTE->ReorderIndices.empty() &&
12224 ((FrontTE->ReuseShuffleIndices.empty() &&
12225 E->Scalars.size() == FrontTE->Scalars.size()) ||
12226 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12227 std::iota(
Mask.begin(),
Mask.end(), 0);
12230 if (isa<PoisonValue>(V)) {
12234 Mask[
I] = FrontTE->findLaneForValue(V);
12237 ShuffleBuilder.add(*FrontTE, Mask);
12238 Res = ShuffleBuilder.finalize(E->getCommonMask());
12242 if (GatheredScalars.size() != VF &&
12244 return any_of(TEs, [&](
const TreeEntry *TE) {
12245 return TE->getVectorFactor() == VF;
12248 GatheredScalars.append(VF - GatheredScalars.size(),
12252 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
12260 bool IsRootPoison) {
12263 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
12270 int NumNonConsts = 0;
12273 if (isa<UndefValue>(V)) {
12274 if (!isa<PoisonValue>(V)) {
12289 Scalars.
front() = OrigV;
12292 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
12293 Scalars[Res.first->second] = OrigV;
12294 ReuseMask[
I] = Res.first->second;
12297 if (NumNonConsts == 1) {
12302 if (!UndefPos.
empty() && UndefPos.
front() == 0)
12305 ReuseMask[SinglePos] = SinglePos;
12306 }
else if (!UndefPos.
empty() && IsSplat) {
12311 return !isa<UndefValue>(V) &&
12313 (E->UserTreeIndices.size() == 1 &&
12317 return E->UserTreeIndices.front().EdgeIdx !=
12318 U.getOperandNo() &&
12320 E->UserTreeIndices.front().UserTE->Scalars,
12324 if (It != Scalars.
end()) {
12326 int Pos = std::distance(Scalars.
begin(), It);
12327 for (
int I : UndefPos) {
12329 ReuseMask[
I] = Pos;
12338 for (
int I : UndefPos) {
12340 if (isa<UndefValue>(Scalars[
I]))
12347 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
12348 bool IsNonPoisoned =
true;
12349 bool IsUsedInExpr =
true;
12350 Value *Vec1 =
nullptr;
12351 if (!ExtractShuffles.
empty()) {
12355 Value *Vec2 =
nullptr;
12356 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
12360 if (UseVecBaseAsInput) {
12361 Vec1 = ExtractVecBase;
12363 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
12366 if (isa<UndefValue>(E->Scalars[
I]))
12368 auto *EI = cast<ExtractElementInst>(E->Scalars[
I]);
12369 Value *VecOp = EI->getVectorOperand();
12370 if (
const auto *TE = getTreeEntry(VecOp))
12371 if (
TE->VectorizedValue)
12372 VecOp =
TE->VectorizedValue;
12375 }
else if (Vec1 != VecOp) {
12376 assert((!Vec2 || Vec2 == VecOp) &&
12377 "Expected only 1 or 2 vectors shuffle.");
12383 IsUsedInExpr =
false;
12386 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12388 IsUsedInExpr &= FindReusedSplat(
12390 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
12391 ExtractMask.size());
12392 ShuffleBuilder.add(Vec1, ExtractMask,
true);
12395 IsUsedInExpr =
false;
12400 if (!GatherShuffles.
empty()) {
12401 unsigned SliceSize = E->Scalars.size() / NumParts;
12403 for (
const auto [
I, TEs] :
enumerate(Entries)) {
12406 "No shuffles with empty entries list expected.");
12410 "Expected shuffle of 1 or 2 entries.");
12413 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
12414 if (TEs.
size() == 1) {
12415 IsUsedInExpr &= FindReusedSplat(
12416 VecMask, TEs.
front()->getVectorFactor(),
I, SliceSize);
12417 ShuffleBuilder.add(*TEs.
front(), VecMask);
12418 if (TEs.
front()->VectorizedValue)
12422 IsUsedInExpr =
false;
12423 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
12424 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
12435 int EMSz = ExtractMask.size();
12436 int MSz =
Mask.size();
12439 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
12440 bool IsIdentityShuffle =
12441 ((UseVecBaseAsInput ||
12443 [](
const std::optional<TTI::ShuffleKind> &SK) {
12447 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
12449 (!GatherShuffles.
empty() &&
12451 [](
const std::optional<TTI::ShuffleKind> &SK) {
12455 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
12457 bool EnoughConstsForShuffle =
12461 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12465 return isa<Constant>(V) && !isa<UndefValue>(V);
12467 (!IsIdentityShuffle ||
12468 (GatheredScalars.size() == 2 &&
12470 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
12472 return isa<Constant>(V) && !isa<PoisonValue>(V);
12476 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
12477 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
12483 if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12485 TryPackScalars(GatheredScalars, BVMask,
true);
12486 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12487 ShuffleBuilder.add(BV, BVMask);
12490 return isa<PoisonValue>(V) ||
12491 (IsSingleShuffle && ((IsIdentityShuffle &&
12492 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12494 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12496 Res = ShuffleBuilder.finalize(
12497 E->ReuseShuffleIndices, E->Scalars.size(),
12499 TryPackScalars(NonConstants, Mask,
false);
12500 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
12505 TryPackScalars(GatheredScalars, ReuseMask,
true);
12506 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
12507 ShuffleBuilder.add(BV, ReuseMask);
12508 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12513 if (!isa<PoisonValue>(V))
12516 Value *BV = ShuffleBuilder.gather(E->Scalars);
12517 ShuffleBuilder.add(BV, Mask);
12518 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12522 Res = ShuffleBuilder.createFreeze(Res);
12526Value *BoUpSLP::createBuildVector(
const TreeEntry *E,
Type *ScalarTy) {
12527 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12534 if (E->VectorizedValue &&
12535 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12536 E->isAltShuffle())) {
12537 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *E->Scalars[0] <<
".\n");
12538 return E->VectorizedValue;
12541 Value *
V = E->Scalars.front();
12542 Type *ScalarTy =
V->getType();
12543 if (
auto *Store = dyn_cast<StoreInst>(V))
12544 ScalarTy =
Store->getValueOperand()->getType();
12545 else if (
auto *IE = dyn_cast<InsertElementInst>(V))
12546 ScalarTy =
IE->getOperand(1)->getType();
12547 auto It = MinBWs.
find(E);
12548 if (It != MinBWs.
end())
12551 if (E->State == TreeEntry::NeedToGather) {
12553 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12554 setInsertPointAfterBundle(E);
12555 Value *Vec = createBuildVector(E, ScalarTy);
12556 E->VectorizedValue = Vec;
12561 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E,
VectorType *VecTy) {
12562 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
12563 if (E->getOpcode() == Instruction::Store &&
12564 E->State == TreeEntry::Vectorize) {
12566 ArrayRef(
reinterpret_cast<const int *
>(E->ReorderIndices.begin()),
12567 E->ReorderIndices.size());
12568 ShuffleBuilder.add(V, Mask);
12569 }
else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12570 ShuffleBuilder.addOrdered(V, std::nullopt);
12572 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12574 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12577 assert((E->State == TreeEntry::Vectorize ||
12578 E->State == TreeEntry::ScatterVectorize ||
12579 E->State == TreeEntry::StridedVectorize) &&
12580 "Unhandled state");
12581 unsigned ShuffleOrOp =
12582 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
12584 auto GetOperandSignedness = [&](
unsigned Idx) {
12585 const TreeEntry *OpE = getOperandEntry(E,
Idx);
12586 bool IsSigned =
false;
12587 auto It = MinBWs.
find(OpE);
12588 if (It != MinBWs.
end())
12589 IsSigned = It->second.second;
12592 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12596 switch (ShuffleOrOp) {
12597 case Instruction::PHI: {
12598 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12599 E != VectorizableTree.front().get() ||
12600 !E->UserTreeIndices.empty()) &&
12601 "PHI reordering is free.");
12602 if (PostponedPHIs && E->VectorizedValue)
12603 return E->VectorizedValue;
12604 auto *PH = cast<PHINode>(VL0);
12606 PH->getParent()->getFirstNonPHIIt());
12608 if (PostponedPHIs || !E->VectorizedValue) {
12615 PH->getParent()->getFirstInsertionPt());
12618 V = FinalShuffle(V, E, VecTy);
12620 E->VectorizedValue =
V;
12624 PHINode *NewPhi = cast<PHINode>(E->PHI);
12633 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12639 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12643 if (!VisitedBBs.
insert(IBB).second) {
12650 Value *Vec = vectorizeOperand(E,
I,
true);
12651 if (VecTy != Vec->
getType()) {
12653 getOperandEntry(E,
I)->State == TreeEntry::NeedToGather ||
12654 MinBWs.
contains(getOperandEntry(E,
I))) &&
12655 "Expected item in MinBWs.");
12656 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
12662 "Invalid number of incoming values");
12666 case Instruction::ExtractElement: {
12667 Value *
V = E->getSingleOperand(0);
12668 if (
const TreeEntry *TE = getTreeEntry(V))
12669 V =
TE->VectorizedValue;
12670 setInsertPointAfterBundle(E);
12671 V = FinalShuffle(V, E, VecTy);
12672 E->VectorizedValue =
V;
12675 case Instruction::ExtractValue: {
12676 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12681 NewV = FinalShuffle(NewV, E, VecTy);
12682 E->VectorizedValue = NewV;
12685 case Instruction::InsertElement: {
12686 assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
12688 Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
12690 Type *ScalarTy =
Op.front()->getType();
12691 if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
12693 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
12694 assert(Res.first > 0 &&
"Expected item in MinBWs.");
12699 cast<FixedVectorType>(
V->getType())->getNumElements()),
12704 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
12705 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12707 const unsigned NumElts =
12708 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12709 const unsigned NumScalars = E->Scalars.size();
12712 assert(
Offset < NumElts &&
"Failed to find vector index offset");
12716 if (!E->ReorderIndices.empty()) {
12721 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
12724 bool IsIdentity =
true;
12726 Mask.swap(PrevMask);
12727 for (
unsigned I = 0;
I < NumScalars; ++
I) {
12730 IsIdentity &= InsertIdx -
Offset ==
I;
12733 if (!IsIdentity || NumElts != NumScalars) {
12737 if (NumElts != NumScalars &&
Offset == 0) {
12746 InsertMask[*InsertIdx] = *InsertIdx;
12747 if (!
Ins->hasOneUse())
12749 Ins = dyn_cast_or_null<InsertElementInst>(
12750 Ins->getUniqueUndroppableUser());
12753 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12755 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12758 if (!IsFirstPoison.
all()) {
12760 for (
unsigned I = 0;
I < NumElts;
I++) {
12762 IsFirstUndef.
test(
I)) {
12763 if (IsVNonPoisonous) {
12764 InsertMask[
I] =
I < NumScalars ?
I : 0;
12769 if (
Idx >= NumScalars)
12770 Idx = NumScalars - 1;
12771 InsertMask[
I] = NumScalars +
Idx;
12785 if (
auto *
I = dyn_cast<Instruction>(V)) {
12786 GatherShuffleExtractSeq.
insert(
I);
12787 CSEBlocks.
insert(
I->getParent());
12792 for (
unsigned I = 0;
I < NumElts;
I++) {
12797 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12800 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
12801 NumElts != NumScalars) {
12802 if (IsFirstUndef.
all()) {
12805 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12806 if (!IsFirstPoison.
all()) {
12807 for (
unsigned I = 0;
I < NumElts;
I++) {
12809 InsertMask[
I] =
I + NumElts;
12816 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
12817 if (
auto *
I = dyn_cast<Instruction>(V)) {
12818 GatherShuffleExtractSeq.
insert(
I);
12819 CSEBlocks.
insert(
I->getParent());
12824 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12825 for (
unsigned I = 0;
I < NumElts;
I++) {
12829 InsertMask[
I] += NumElts;
12832 FirstInsert->getOperand(0), V, InsertMask,
12833 cast<Instruction>(E->Scalars.back())->getName());
12834 if (
auto *
I = dyn_cast<Instruction>(V)) {
12835 GatherShuffleExtractSeq.
insert(
I);
12836 CSEBlocks.
insert(
I->getParent());
12841 ++NumVectorInstructions;
12842 E->VectorizedValue =
V;
12845 case Instruction::ZExt:
12846 case Instruction::SExt:
12847 case Instruction::FPToUI:
12848 case Instruction::FPToSI:
12849 case Instruction::FPExt:
12850 case Instruction::PtrToInt:
12851 case Instruction::IntToPtr:
12852 case Instruction::SIToFP:
12853 case Instruction::UIToFP:
12854 case Instruction::Trunc:
12855 case Instruction::FPTrunc:
12856 case Instruction::BitCast: {
12857 setInsertPointAfterBundle(E);
12859 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12860 if (E->VectorizedValue) {
12861 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12862 return E->VectorizedValue;
12865 auto *CI = cast<CastInst>(VL0);
12867 Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
12868 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
12870 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
12873 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
12874 if (SrcIt != MinBWs.
end())
12875 SrcBWSz = SrcIt->second.first;
12876 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
12877 if (BWSz == SrcBWSz) {
12878 VecOpcode = Instruction::BitCast;
12879 }
else if (BWSz < SrcBWSz) {
12880 VecOpcode = Instruction::Trunc;
12881 }
else if (It != MinBWs.
end()) {
12882 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12883 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12884 }
else if (SrcIt != MinBWs.
end()) {
12885 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12887 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12889 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
12890 !SrcIt->second.second) {
12891 VecOpcode = Instruction::UIToFP;
12893 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12895 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
12896 V = FinalShuffle(V, E, VecTy);
12898 E->VectorizedValue =
V;
12899 ++NumVectorInstructions;
12902 case Instruction::FCmp:
12903 case Instruction::ICmp: {
12904 setInsertPointAfterBundle(E);
12906 Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
12907 if (E->VectorizedValue) {
12908 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12909 return E->VectorizedValue;
12911 Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
12912 if (E->VectorizedValue) {
12913 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12914 return E->VectorizedValue;
12916 if (
L->getType() !=
R->getType()) {
12917 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12918 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12919 MinBWs.
contains(getOperandEntry(E, 0)) ||
12920 MinBWs.
contains(getOperandEntry(E, 1))) &&
12921 "Expected item in MinBWs.");
12922 if (cast<VectorType>(
L->getType())
12924 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
12926 ->getIntegerBitWidth()) {
12927 Type *CastTy =
R->getType();
12930 Type *CastTy =
L->getType();
12939 VecTy = cast<FixedVectorType>(
V->getType());
12940 V = FinalShuffle(V, E, VecTy);
12942 E->VectorizedValue =
V;
12943 ++NumVectorInstructions;
12946 case Instruction::Select: {
12947 setInsertPointAfterBundle(E);
12949 Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
12950 if (E->VectorizedValue) {
12951 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12952 return E->VectorizedValue;
12954 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12955 if (E->VectorizedValue) {
12956 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12957 return E->VectorizedValue;
12959 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12960 if (E->VectorizedValue) {
12961 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12962 return E->VectorizedValue;
12966 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12967 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12968 MinBWs.
contains(getOperandEntry(E, 1)) ||
12969 MinBWs.
contains(getOperandEntry(E, 2))) &&
12970 "Expected item in MinBWs.");
12971 if (True->
getType() != VecTy)
12972 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
12973 if (False->
getType() != VecTy)
12974 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
12978 V = FinalShuffle(V, E, VecTy);
12980 E->VectorizedValue =
V;
12981 ++NumVectorInstructions;
12984 case Instruction::FNeg: {
12985 setInsertPointAfterBundle(E);
12987 Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
12989 if (E->VectorizedValue) {
12990 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12991 return E->VectorizedValue;
12997 if (
auto *
I = dyn_cast<Instruction>(V))
13000 V = FinalShuffle(V, E, VecTy);
13002 E->VectorizedValue =
V;
13003 ++NumVectorInstructions;
13007 case Instruction::Add:
13008 case Instruction::FAdd:
13009 case Instruction::Sub:
13010 case Instruction::FSub:
13011 case Instruction::Mul:
13012 case Instruction::FMul:
13013 case Instruction::UDiv:
13014 case Instruction::SDiv:
13015 case Instruction::FDiv:
13016 case Instruction::URem:
13017 case Instruction::SRem:
13018 case Instruction::FRem:
13019 case Instruction::Shl:
13020 case Instruction::LShr:
13021 case Instruction::AShr:
13022 case Instruction::And:
13023 case Instruction::Or:
13024 case Instruction::Xor: {
13025 setInsertPointAfterBundle(E);
13027 Value *
LHS = vectorizeOperand(E, 0, PostponedPHIs);
13028 if (E->VectorizedValue) {
13029 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13030 return E->VectorizedValue;
13032 Value *
RHS = vectorizeOperand(E, 1, PostponedPHIs);
13033 if (E->VectorizedValue) {
13034 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13035 return E->VectorizedValue;
13037 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
13038 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13041 auto *CI = dyn_cast<ConstantInt>(
Op);
13042 return CI && CI->getValue().countr_one() >= It->second.first;
13044 V = FinalShuffle(
I == 0 ? RHS : LHS, E, VecTy);
13045 E->VectorizedValue =
V;
13046 ++NumVectorInstructions;
13053 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13054 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13055 MinBWs.
contains(getOperandEntry(E, 0)) ||
13056 MinBWs.
contains(getOperandEntry(E, 1))) &&
13057 "Expected item in MinBWs.");
13068 if (
auto *
I = dyn_cast<Instruction>(V)) {
13071 if (!MinBWs.
contains(E) && ShuffleOrOp == Instruction::Sub &&
13073 return isCommutative(cast<Instruction>(V));
13075 I->setHasNoUnsignedWrap(
false);
13078 V = FinalShuffle(V, E, VecTy);
13080 E->VectorizedValue =
V;
13081 ++NumVectorInstructions;
13085 case Instruction::Load: {
13088 setInsertPointAfterBundle(E);
13090 LoadInst *LI = cast<LoadInst>(VL0);
13093 if (E->State == TreeEntry::Vectorize) {
13095 }
else if (E->State == TreeEntry::StridedVectorize) {
13096 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13097 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13098 PO = IsReverseOrder ? PtrN : Ptr0;
13104 int Stride = *Diff / (
static_cast<int>(E->Scalars.size()) - 1);
13106 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13107 DL->getTypeAllocSize(ScalarTy));
13111 return cast<LoadInst>(V)->getPointerOperand();
13114 std::optional<Value *> Stride =
13123 (IsReverseOrder ? -1 : 1) *
13124 static_cast<int>(
DL->getTypeAllocSize(ScalarTy))));
13126 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13128 Intrinsic::experimental_vp_strided_load,
13129 {VecTy, PO->
getType(), StrideTy},
13131 Builder.
getInt32(E->Scalars.size())});
13137 assert(E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
13138 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13139 if (E->VectorizedValue) {
13140 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13141 return E->VectorizedValue;
13144 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13149 V = FinalShuffle(V, E, VecTy);
13150 E->VectorizedValue =
V;
13151 ++NumVectorInstructions;
13154 case Instruction::Store: {
13155 auto *
SI = cast<StoreInst>(VL0);
13157 setInsertPointAfterBundle(E);
13159 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13160 if (VecValue->
getType() != VecTy)
13162 Builder.
CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13163 VecValue = FinalShuffle(VecValue, E, VecTy);
13167 if (E->State == TreeEntry::Vectorize) {
13170 assert(E->State == TreeEntry::StridedVectorize &&
13171 "Expected either strided or conseutive stores.");
13172 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13173 Type *StrideTy =
DL->getIndexType(
SI->getPointerOperandType());
13175 Intrinsic::experimental_vp_strided_store,
13176 {VecTy,
Ptr->getType(), StrideTy},
13179 StrideTy, -
static_cast<int>(
DL->getTypeAllocSize(ScalarTy))),
13181 Builder.
getInt32(E->Scalars.size())});
13190 E->VectorizedValue =
V;
13191 ++NumVectorInstructions;
13194 case Instruction::GetElementPtr: {
13195 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13196 setInsertPointAfterBundle(E);
13198 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13199 if (E->VectorizedValue) {
13200 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13201 return E->VectorizedValue;
13205 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
13206 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13207 if (E->VectorizedValue) {
13208 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13209 return E->VectorizedValue;
13214 Value *
V = Builder.
CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13215 if (
Instruction *
I = dyn_cast<GetElementPtrInst>(V)) {
13217 for (
Value *V : E->Scalars) {
13218 if (isa<GetElementPtrInst>(V))
13224 V = FinalShuffle(V, E, VecTy);
13226 E->VectorizedValue =
V;
13227 ++NumVectorInstructions;
13231 case Instruction::Call: {
13232 CallInst *CI = cast<CallInst>(VL0);
13233 setInsertPointAfterBundle(E);
13239 It != MinBWs.
end() ? It->second.first : 0);
13242 VecCallCosts.first <= VecCallCosts.second;
13244 Value *ScalarArg =
nullptr;
13250 auto *CEI = cast<CallInst>(VL0);
13251 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
13256 ScalarArg = CEI->getArgOperand(
I);
13259 if (
ID == Intrinsic::abs && It != MinBWs.
end() &&
13260 It->second.first <
DL->getTypeSizeInBits(CEI->getType()))
13268 Value *OpVec = vectorizeOperand(E,
I, PostponedPHIs);
13269 if (E->VectorizedValue) {
13270 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13271 return E->VectorizedValue;
13273 ScalarArg = CEI->getArgOperand(
I);
13274 if (cast<VectorType>(OpVec->
getType())->getElementType() !=
13276 It == MinBWs.
end()) {
13279 OpVec = Builder.
CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
13280 }
else if (It != MinBWs.
end()) {
13281 OpVec = Builder.
CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
13290 if (!UseIntrinsic) {
13306 V = FinalShuffle(V, E, VecTy);
13308 E->VectorizedValue =
V;
13309 ++NumVectorInstructions;
13312 case Instruction::ShuffleVector: {
13313 assert(E->isAltShuffle() &&
13318 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13319 "Invalid Shuffle Vector Operand");
13323 setInsertPointAfterBundle(E);
13324 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13325 if (E->VectorizedValue) {
13326 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13327 return E->VectorizedValue;
13329 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13331 setInsertPointAfterBundle(E);
13332 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13334 if (E->VectorizedValue) {
13335 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13336 return E->VectorizedValue;
13343 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13344 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13345 MinBWs.
contains(getOperandEntry(E, 0)) ||
13346 MinBWs.
contains(getOperandEntry(E, 1))) &&
13347 "Expected item in MinBWs.");
13348 Type *CastTy = VecTy;
13352 ->getIntegerBitWidth() < cast<VectorType>(
RHS->
getType())
13354 ->getIntegerBitWidth())
13371 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13372 V0 = Builder.
CreateCmp(CI0->getPredicate(), LHS, RHS);
13373 auto *AltCI = cast<CmpInst>(E->getAltOp());
13375 V1 = Builder.
CreateCmp(AltPred, LHS, RHS);
13378 unsigned SrcBWSz =
DL->getTypeSizeInBits(
13379 cast<VectorType>(
LHS->
getType())->getElementType());
13380 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
13381 if (BWSz <= SrcBWSz) {
13382 if (BWSz < SrcBWSz)
13385 if (
auto *
I = dyn_cast<Instruction>(LHS))
13387 E->VectorizedValue =
LHS;
13388 ++NumVectorInstructions;
13399 for (
Value *V : {V0, V1}) {
13400 if (
auto *
I = dyn_cast<Instruction>(V)) {
13401 GatherShuffleExtractSeq.
insert(
I);
13402 CSEBlocks.
insert(
I->getParent());
13411 E->buildAltOpShuffleMask(
13413 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
13417 Mask, &OpScalars, &AltScalars);
13421 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
13423 if (
auto *
I = dyn_cast<Instruction>(Vec);
13424 I && Opcode == Instruction::Sub && !MinBWs.
contains(E) &&
13426 auto *IV = cast<Instruction>(V);
13427 return IV->getOpcode() == Instruction::Sub &&
13428 isCommutative(cast<Instruction>(IV));
13430 I->setHasNoUnsignedWrap(
false);
13432 DropNuwFlag(V0, E->getOpcode());
13433 DropNuwFlag(V1, E->getAltOpcode());
13436 if (
auto *
I = dyn_cast<Instruction>(V)) {
13438 GatherShuffleExtractSeq.
insert(
I);
13439 CSEBlocks.
insert(
I->getParent());
13442 E->VectorizedValue =
V;
13443 ++NumVectorInstructions;
13456 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13462struct ShuffledInsertData {
13475 for (
auto &BSIter : BlocksSchedules) {
13476 scheduleBlock(BSIter.second.get());
13480 EntryToLastInstruction.
clear();
13490 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13491 if (TE->State == TreeEntry::Vectorize &&
13492 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13493 TE->VectorizedValue)
13499 for (
const TreeEntry *E : PostponedNodes) {
13500 auto *TE =
const_cast<TreeEntry *
>(E);
13501 if (
auto *VecTE = getTreeEntry(TE->Scalars.front()))
13502 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13503 TE->UserTreeIndices.front().EdgeIdx)) &&
13504 VecTE->isSame(TE->Scalars))
13508 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13509 TE->VectorizedValue =
nullptr;
13511 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13520 if (isa<PHINode>(UserI)) {
13523 for (
User *U : PrevVec->users()) {
13526 auto *UI = dyn_cast<Instruction>(U);
13527 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->
getParent())
13529 if (UI->comesBefore(InsertPt))
13538 if (Vec->
getType() != PrevVec->getType()) {
13540 PrevVec->getType()->isIntOrIntVectorTy() &&
13541 "Expected integer vector types only.");
13542 std::optional<bool> IsSigned;
13543 for (
Value *V : TE->Scalars) {
13544 if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
13545 auto It = MinBWs.
find(BaseTE);
13546 if (It != MinBWs.
end()) {
13547 IsSigned = IsSigned.value_or(
false) || It->second.second;
13551 for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
13552 auto It = MinBWs.
find(MNTE);
13553 if (It != MinBWs.
end()) {
13554 IsSigned = IsSigned.value_or(
false) || It->second.second;
13559 if (IsSigned.value_or(
false))
13562 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13563 auto It = MinBWs.
find(BVE);
13564 if (It != MinBWs.
end()) {
13565 IsSigned = IsSigned.value_or(
false) || It->second.second;
13570 if (IsSigned.value_or(
false))
13572 if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
13574 IsSigned.value_or(
false) ||
13578 if (IsSigned.value_or(
false))
13582 if (IsSigned.value_or(
false)) {
13584 auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
13585 if (It != MinBWs.
end())
13586 IsSigned = It->second.second;
13589 "Expected user node or perfect diamond match in MinBWs.");
13593 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
13596 auto It = PostponedValues.
find(PrevVec);
13597 if (It != PostponedValues.
end()) {
13598 for (TreeEntry *VTE : It->getSecond())
13599 VTE->VectorizedValue = Vec;
13619 for (
const auto &ExternalUse : ExternalUses) {
13620 Value *Scalar = ExternalUse.Scalar;
13627 TreeEntry *E = getTreeEntry(Scalar);
13628 assert(E &&
"Invalid scalar");
13629 assert(E->State != TreeEntry::NeedToGather &&
13630 "Extracting from a gather list");
13632 if (E->getOpcode() == Instruction::GetElementPtr &&
13633 !isa<GetElementPtrInst>(Scalar))
13636 Value *Vec = E->VectorizedValue;
13637 assert(Vec &&
"Can't find vectorizable value");
13640 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
13641 if (Scalar->getType() != Vec->
getType()) {
13642 Value *Ex =
nullptr;
13643 Value *ExV =
nullptr;
13644 auto *
GEP = dyn_cast<GetElementPtrInst>(Scalar);
13646 auto It = ScalarToEEs.find(Scalar);
13647 if (It != ScalarToEEs.end()) {
13651 if (EEIt != It->second.end()) {
13657 if (
auto *CI = EEIt->second.second)
13661 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13666 if (
auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13667 Value *V = ES->getVectorOperand();
13668 if (
const TreeEntry *ETE = getTreeEntry(V))
13669 V = ETE->VectorizedValue;
13671 }
else if (ReplaceGEP) {
13674 auto *CloneGEP =
GEP->clone();
13675 if (isa<Instruction>(Vec))
13679 CloneGEP->insertBefore(
GEP);
13680 if (
GEP->hasName())
13681 CloneGEP->takeName(
GEP);
13689 if (Scalar->getType() != Ex->
getType())
13691 MinBWs.
find(E)->second.second);
13692 if (
auto *
I = dyn_cast<Instruction>(Ex))
13693 ScalarToEEs[Scalar].try_emplace(
13695 std::make_pair(
I, cast<Instruction>(ExV)));
13699 if (
auto *ExI = dyn_cast<Instruction>(Ex)) {
13700 GatherShuffleExtractSeq.
insert(ExI);
13701 CSEBlocks.
insert(ExI->getParent());
13705 assert(isa<FixedVectorType>(Scalar->getType()) &&
13706 isa<InsertElementInst>(Scalar) &&
13707 "In-tree scalar of vector type is not insertelement?");
13708 auto *IE = cast<InsertElementInst>(Scalar);
13716 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
13721 if (ExternalUsesAsGEPs.contains(U))
13723 TreeEntry *UseEntry = getTreeEntry(U);
13725 (UseEntry->State == TreeEntry::Vectorize ||
13727 TreeEntry::StridedVectorize) &&
13728 (E->State == TreeEntry::Vectorize ||
13729 E->State == TreeEntry::StridedVectorize) &&
13730 doesInTreeUserNeedToExtract(
13732 cast<Instruction>(UseEntry->Scalars.front()),
13735 "Scalar with nullptr User must be registered in "
13736 "ExternallyUsedValues map or remain as scalar in vectorized "
13738 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
13739 if (
auto *
PHI = dyn_cast<PHINode>(VecI))
13741 PHI->getParent()->getFirstNonPHIIt());
13744 std::next(VecI->getIterator()));
13748 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13750 Scalar->replaceAllUsesWith(NewInst);
13751 ReplacedExternals.emplace_back(Scalar, NewInst);
13755 if (
auto *VU = dyn_cast<InsertElementInst>(
User)) {
13757 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13758 if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
13759 if (!UsedInserts.
insert(VU).second)
13762 auto BWIt = MinBWs.
find(E);
13764 auto *ScalarTy = FTy->getElementType();
13765 auto Key = std::make_pair(Vec, ScalarTy);
13766 auto VecIt = VectorCasts.
find(Key);
13767 if (VecIt == VectorCasts.
end()) {
13769 if (
auto *IVec = dyn_cast<PHINode>(Vec))
13771 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
13772 else if (
auto *IVec = dyn_cast<Instruction>(Vec))
13778 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
13779 BWIt->second.second);
13782 Vec = VecIt->second;
13789 find_if(ShuffledInserts, [VU](
const ShuffledInsertData &
Data) {
13796 unsigned Idx = *InsertIdx;
13797 if (It == ShuffledInserts.
end()) {
13799 It = std::next(ShuffledInserts.
begin(),
13800 ShuffledInserts.
size() - 1);
13806 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
13807 if (IEBase !=
User &&
13808 (!IEBase->hasOneUse() ||
13812 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
13814 IEBase = cast<InsertElementInst>(
Base);
13817 "InsertElementInstruction used already.");
13818 Mask[IEIdx] = IEIdx;
13819 Base = IEBase->getOperand(0);
13820 }
while (E == getTreeEntry(
Base));
13823 Base = cast<InsertElementInst>(
Base)->getOperand(0);
13827 auto It = VectorToInsertElement.
find(
Base);
13828 if (It != VectorToInsertElement.
end())
13835 Mask[
Idx] = ExternalUse.Lane;
13836 It->InsertElements.push_back(cast<InsertElementInst>(
User));
13845 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
13847 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13848 if (PH->getIncomingValue(
I) == Scalar) {
13850 PH->getIncomingBlock(
I)->getTerminator();
13851 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13853 std::next(VecI->getIterator()));
13857 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13858 PH->setOperand(
I, NewInst);
13863 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13868 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13878 int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
13879 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
13881 CombinedMask1[
I] = Mask[
I];
13883 CombinedMask2[
I] = Mask[
I] - VF;
13886 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
13887 ShuffleBuilder.
add(V1, CombinedMask1);
13889 ShuffleBuilder.
add(V2, CombinedMask2);
13890 return ShuffleBuilder.
finalize(std::nullopt);
13894 bool ForSingleMask) {
13895 unsigned VF = Mask.size();
13896 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
13898 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
13899 Vec = CreateShuffle(Vec,
nullptr, Mask);
13900 return std::make_pair(Vec,
true);
13902 if (!ForSingleMask) {
13904 for (
unsigned I = 0;
I < VF; ++
I) {
13906 ResizeMask[Mask[
I]] = Mask[
I];
13908 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
13912 return std::make_pair(Vec,
false);
13916 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
13922 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
13923 Value *NewInst = performExtractsShuffleAction<Value>(
13927 return cast<VectorType>(Vec->getType())
13928 ->getElementCount()
13929 .getKnownMinValue();
13934 assert((Vals.size() == 1 || Vals.size() == 2) &&
13935 "Expected exactly 1 or 2 input values.");
13936 if (Vals.size() == 1) {
13939 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13940 ->getNumElements() ||
13941 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13942 return CreateShuffle(Vals.front(), nullptr, Mask);
13943 return Vals.front();
13945 return CreateShuffle(Vals.
front() ? Vals.
front()
13947 Vals.
back(), Mask);
13949 auto It = ShuffledInserts[
I].InsertElements.
rbegin();
13952 if (It != ShuffledInserts[
I].InsertElements.
rend())
13955 while (It != ShuffledInserts[
I].InsertElements.
rend()) {
13956 assert(II &&
"Must be an insertelement instruction.");
13960 Inserts.
push_back(cast<Instruction>(II));
13961 II = dyn_cast<InsertElementInst>(II->
getOperand(0));
13965 if (
auto *NewI = dyn_cast<Instruction>(NewInst))
13972 IE->replaceUsesOfWith(IE->getOperand(0),
13974 IE->replaceUsesOfWith(IE->getOperand(1),
13978 CSEBlocks.
insert(LastInsert->getParent());
13983 for (
auto &TEPtr : VectorizableTree) {
13984 TreeEntry *Entry = TEPtr.get();
13987 if (Entry->State == TreeEntry::NeedToGather)
13990 assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
13993 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13994 Value *Scalar = Entry->Scalars[Lane];
13996 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13997 !isa<GetElementPtrInst>(Scalar))
14000 Type *Ty = Scalar->getType();
14002 for (
User *U : Scalar->users()) {
14006 assert((getTreeEntry(U) ||
14007 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14008 (isa_and_nonnull<Instruction>(U) &&
14009 isDeleted(cast<Instruction>(U)))) &&
14010 "Deleting out-of-tree value");
14014 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
14019 RemovedInsts.
push_back(cast<Instruction>(Scalar));
14025 if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14026 V->mergeDIAssignID(RemovedInsts);
14029 InstrElementSize.
clear();
14031 const TreeEntry &RootTE = *VectorizableTree.front().get();
14032 Value *Vec = RootTE.VectorizedValue;
14033 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14034 It != MinBWs.end() &&
14035 ReductionBitWidth != It->second.first) {
14038 ReductionRoot->getIterator());
14042 cast<VectorType>(Vec->
getType())->getElementCount()),
14043 It->second.second);
14050 <<
" gather sequences instructions.\n");
14057 Loop *L = LI->getLoopFor(
I->getParent());
14062 BasicBlock *PreHeader = L->getLoopPreheader();
14070 auto *OpI = dyn_cast<Instruction>(V);
14071 return OpI && L->contains(OpI);
14077 CSEBlocks.
insert(PreHeader);
14092 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
14093 "Different nodes should have different DFS numbers");
14094 return A->getDFSNumIn() <
B->getDFSNumIn();
14104 if (I1->getType() != I2->getType())
14106 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14107 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14109 return I1->isIdenticalTo(I2);
14110 if (SI1->isIdenticalTo(SI2))
14112 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
14113 if (SI1->getOperand(
I) != SI2->getOperand(
I))
14116 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14120 unsigned LastUndefsCnt = 0;
14121 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
14127 NewMask[
I] != SM1[
I])
14130 NewMask[
I] = SM1[
I];
14134 return SM1.
size() - LastUndefsCnt > 1 &&
14138 SM1.
size() - LastUndefsCnt));
14144 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
14147 "Worklist not sorted properly!");
14153 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14154 !GatherShuffleExtractSeq.contains(&In))
14159 bool Replaced =
false;
14162 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14163 DT->
dominates(V->getParent(), In.getParent())) {
14164 In.replaceAllUsesWith(V);
14166 if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
14167 if (!NewMask.
empty())
14168 SI->setShuffleMask(NewMask);
14172 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14173 GatherShuffleExtractSeq.contains(V) &&
14174 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14175 DT->
dominates(In.getParent(), V->getParent())) {
14177 V->replaceAllUsesWith(&In);
14179 if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14180 if (!NewMask.
empty())
14181 SI->setShuffleMask(NewMask);
14189 Visited.push_back(&In);
14194 GatherShuffleExtractSeq.clear();
14197BoUpSLP::ScheduleData *
14199 ScheduleData *Bundle =
nullptr;
14200 ScheduleData *PrevInBundle =
nullptr;
14201 for (
Value *V : VL) {
14204 ScheduleData *BundleMember = getScheduleData(V);
14206 "no ScheduleData for bundle member "
14207 "(maybe not in same basic block)");
14208 assert(BundleMember->isSchedulingEntity() &&
14209 "bundle member already part of other bundle");
14210 if (PrevInBundle) {
14211 PrevInBundle->NextInBundle = BundleMember;
14213 Bundle = BundleMember;
14217 BundleMember->FirstInBundle = Bundle;
14218 PrevInBundle = BundleMember;
14220 assert(Bundle &&
"Failed to find schedule bundle");
14226std::optional<BoUpSLP::ScheduleData *>
14228 const InstructionsState &S) {
14239 auto TryScheduleBundleImpl = [
this, OldScheduleEnd, SLP](
bool ReSchedule,
14240 ScheduleData *Bundle) {
14246 if (ScheduleEnd != OldScheduleEnd) {
14247 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode())
14248 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->clearDependencies(); });
14253 <<
" in block " << BB->
getName() <<
"\n");
14254 calculateDependencies(Bundle,
true, SLP);
14259 initialFillReadyList(ReadyInsts);
14266 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14267 !ReadyInsts.empty()) {
14268 ScheduleData *Picked = ReadyInsts.pop_back_val();
14269 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14270 "must be ready to schedule");
14271 schedule(Picked, ReadyInsts);
14277 for (
Value *V : VL) {
14280 if (!extendSchedulingRegion(V, S)) {
14287 TryScheduleBundleImpl(
false,
nullptr);
14288 return std::nullopt;
14292 bool ReSchedule =
false;
14293 for (
Value *V : VL) {
14296 ScheduleData *BundleMember = getScheduleData(V);
14298 "no ScheduleData for bundle member (maybe not in same basic block)");
14302 ReadyInsts.remove(BundleMember);
14304 if (!BundleMember->IsScheduled)
14309 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
14310 <<
" was already scheduled\n");
14314 auto *Bundle = buildBundle(VL);
14315 TryScheduleBundleImpl(ReSchedule, Bundle);
14316 if (!Bundle->isReady()) {
14317 cancelScheduling(VL, S.OpValue);
14318 return std::nullopt;
14331 ScheduleData *Bundle = getScheduleData(OpValue);
14332 LLVM_DEBUG(
dbgs() <<
"SLP: cancel scheduling of " << *Bundle <<
"\n");
14333 assert(!Bundle->IsScheduled &&
14334 "Can't cancel bundle which is already scheduled");
14335 assert(Bundle->isSchedulingEntity() &&
14337 "tried to unbundle something which is not a bundle");
14340 if (Bundle->isReady())
14341 ReadyInsts.remove(Bundle);
14344 ScheduleData *BundleMember = Bundle;
14345 while (BundleMember) {
14346 assert(BundleMember->FirstInBundle == Bundle &&
"corrupt bundle links");
14347 BundleMember->FirstInBundle = BundleMember;
14348 ScheduleData *Next = BundleMember->NextInBundle;
14349 BundleMember->NextInBundle =
nullptr;
14350 BundleMember->TE =
nullptr;
14351 if (BundleMember->unscheduledDepsInBundle() == 0) {
14352 ReadyInsts.insert(BundleMember);
14354 BundleMember = Next;
14358BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14360 if (ChunkPos >= ChunkSize) {
14361 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14364 return &(ScheduleDataChunks.back()[ChunkPos++]);
14367bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V,
14368 const InstructionsState &S) {
14369 if (getScheduleData(V,
isOneOf(S, V)))
14372 assert(
I &&
"bundle member must be an instruction");
14375 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14377 auto &&CheckScheduleForI = [
this, &S](
Instruction *
I) ->
bool {
14378 ScheduleData *ISD = getScheduleData(
I);
14381 assert(isInSchedulingRegion(ISD) &&
14382 "ScheduleData not in scheduling region");
14383 ScheduleData *SD = allocateScheduleDataChunks();
14385 SD->init(SchedulingRegionID, S.OpValue);
14386 ExtraScheduleDataMap[
I][S.OpValue] = SD;
14389 if (CheckScheduleForI(
I))
14391 if (!ScheduleStart) {
14393 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
14395 ScheduleEnd =
I->getNextNode();
14397 CheckScheduleForI(
I);
14398 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
14399 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
14407 ++ScheduleStart->getIterator().getReverse();
14412 if (
auto *II = dyn_cast<IntrinsicInst>(&
I))
14413 return II->isAssumeLikeIntrinsic();
14416 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14417 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14418 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
14420 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14421 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
14428 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14429 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14431 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
14432 assert(
I->getParent() == ScheduleStart->getParent() &&
14433 "Instruction is in wrong basic block.");
14434 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
14437 CheckScheduleForI(
I);
14442 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
14443 "Expected to reach top of the basic block or instruction down the "
14445 assert(
I->getParent() == ScheduleEnd->getParent() &&
14446 "Instruction is in wrong basic block.");
14447 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
14449 ScheduleEnd =
I->getNextNode();
14451 CheckScheduleForI(
I);
14452 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
14453 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
14457void BoUpSLP::BlockScheduling::initScheduleData(
Instruction *FromI,
14459 ScheduleData *PrevLoadStore,
14460 ScheduleData *NextLoadStore) {
14461 ScheduleData *CurrentLoadStore = PrevLoadStore;
14466 ScheduleData *SD = ScheduleDataMap.lookup(
I);
14468 SD = allocateScheduleDataChunks();
14469 ScheduleDataMap[
I] = SD;
14472 assert(!isInSchedulingRegion(SD) &&
14473 "new ScheduleData already in scheduling region");
14474 SD->init(SchedulingRegionID,
I);
14476 if (
I->mayReadOrWriteMemory() &&
14477 (!isa<IntrinsicInst>(
I) ||
14478 (cast<IntrinsicInst>(
I)->getIntrinsicID() != Intrinsic::sideeffect &&
14479 cast<IntrinsicInst>(
I)->getIntrinsicID() !=
14480 Intrinsic::pseudoprobe))) {
14482 if (CurrentLoadStore) {
14483 CurrentLoadStore->NextLoadStore = SD;
14485 FirstLoadStoreInRegion = SD;
14487 CurrentLoadStore = SD;
14490 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
14491 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14492 RegionHasStackSave =
true;
14494 if (NextLoadStore) {
14495 if (CurrentLoadStore)
14496 CurrentLoadStore->NextLoadStore = NextLoadStore;
14498 LastLoadStoreInRegion = CurrentLoadStore;
14502void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14503 bool InsertInReadyList,
14505 assert(SD->isSchedulingEntity());
14510 while (!WorkList.
empty()) {
14512 for (ScheduleData *BundleMember = SD; BundleMember;
14513 BundleMember = BundleMember->NextInBundle) {
14514 assert(isInSchedulingRegion(BundleMember));
14515 if (BundleMember->hasValidDependencies())
14520 BundleMember->Dependencies = 0;
14521 BundleMember->resetUnscheduledDeps();
14524 if (BundleMember->OpValue != BundleMember->Inst) {
14525 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14526 BundleMember->Dependencies++;
14527 ScheduleData *DestBundle = UseSD->FirstInBundle;
14528 if (!DestBundle->IsScheduled)
14529 BundleMember->incrementUnscheduledDeps(1);
14530 if (!DestBundle->hasValidDependencies())
14534 for (
User *U : BundleMember->Inst->
users()) {
14535 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14536 BundleMember->Dependencies++;
14537 ScheduleData *DestBundle = UseSD->FirstInBundle;
14538 if (!DestBundle->IsScheduled)
14539 BundleMember->incrementUnscheduledDeps(1);
14540 if (!DestBundle->hasValidDependencies())
14547 auto *DepDest = getScheduleData(
I);
14548 assert(DepDest &&
"must be in schedule window");
14549 DepDest->ControlDependencies.push_back(BundleMember);
14550 BundleMember->Dependencies++;
14551 ScheduleData *DestBundle = DepDest->FirstInBundle;
14552 if (!DestBundle->IsScheduled)
14553 BundleMember->incrementUnscheduledDeps(1);
14554 if (!DestBundle->hasValidDependencies())
14562 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14563 I != ScheduleEnd;
I =
I->getNextNode()) {
14568 MakeControlDependent(
I);
14576 if (RegionHasStackSave) {
14580 if (
match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14581 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14582 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14583 I != ScheduleEnd;
I =
I->getNextNode()) {
14584 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
14585 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14590 if (!isa<AllocaInst>(
I))
14594 MakeControlDependent(
I);
14603 if (isa<AllocaInst>(BundleMember->Inst) ||
14604 BundleMember->Inst->mayReadOrWriteMemory()) {
14605 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14606 I != ScheduleEnd;
I =
I->getNextNode()) {
14607 if (!
match(
I, m_Intrinsic<Intrinsic::stacksave>()) &&
14608 !
match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14612 MakeControlDependent(
I);
14619 ScheduleData *DepDest = BundleMember->NextLoadStore;
14624 "NextLoadStore list for non memory effecting bundle?");
14626 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14627 unsigned NumAliased = 0;
14628 unsigned DistToSrc = 1;
14630 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14631 assert(isInSchedulingRegion(DepDest));
14641 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14643 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14650 DepDest->MemoryDependencies.push_back(BundleMember);
14651 BundleMember->Dependencies++;
14652 ScheduleData *DestBundle = DepDest->FirstInBundle;
14653 if (!DestBundle->IsScheduled) {
14654 BundleMember->incrementUnscheduledDeps(1);
14656 if (!DestBundle->hasValidDependencies()) {
14679 if (InsertInReadyList && SD->isReady()) {
14680 ReadyInsts.insert(SD);
14687void BoUpSLP::BlockScheduling::resetSchedule() {
14689 "tried to reset schedule on block which has not been scheduled");
14690 for (
Instruction *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
14691 doForAllOpcodes(
I, [&](ScheduleData *SD) {
14692 assert(isInSchedulingRegion(SD) &&
14693 "ScheduleData not in scheduling region");
14694 SD->IsScheduled =
false;
14695 SD->resetUnscheduledDeps();
14698 ReadyInsts.clear();
14701void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14702 if (!BS->ScheduleStart)
14705 LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
14712 BS->resetSchedule();
14719 struct ScheduleDataCompare {
14720 bool operator()(ScheduleData *SD1, ScheduleData *SD2)
const {
14721 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14724 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14729 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
14730 I =
I->getNextNode()) {
14731 BS->doForAllOpcodes(
I, [
this, &
Idx, BS](ScheduleData *SD) {
14732 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14735 SD->isPartOfBundle() ==
14737 "scheduler and vectorizer bundle mismatch");
14738 SD->FirstInBundle->SchedulingPriority =
Idx++;
14740 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14741 BS->calculateDependencies(SD,
false,
this);
14744 BS->initialFillReadyList(ReadyInsts);
14746 Instruction *LastScheduledInst = BS->ScheduleEnd;
14749 while (!ReadyInsts.empty()) {
14750 ScheduleData *Picked = *ReadyInsts.begin();
14751 ReadyInsts.erase(ReadyInsts.begin());
14755 for (ScheduleData *BundleMember = Picked; BundleMember;
14756 BundleMember = BundleMember->NextInBundle) {
14760 LastScheduledInst = PickedInst;
14763 BS->schedule(Picked, ReadyInsts);
14767#ifdef EXPENSIVE_CHECKS
14771#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14773 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
I =
I->getNextNode()) {
14774 BS->doForAllOpcodes(
I, [&](ScheduleData *SD) {
14775 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14776 assert(SD->IsScheduled &&
"must be scheduled at this point");
14783 BS->ScheduleStart =
nullptr;
14790 if (
auto *Store = dyn_cast<StoreInst>(V))
14791 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14793 if (
auto *IEI = dyn_cast<InsertElementInst>(V))
14796 auto E = InstrElementSize.
find(V);
14797 if (E != InstrElementSize.
end())
14806 if (
auto *
I = dyn_cast<Instruction>(V)) {
14814 Value *FirstNonBool =
nullptr;
14815 while (!Worklist.
empty()) {
14820 auto *Ty =
I->getType();
14821 if (isa<VectorType>(Ty))
14823 if (Ty != Builder.
getInt1Ty() && !FirstNonBool)
14830 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(
I))
14831 Width = std::max<unsigned>(Width,
DL->getTypeSizeInBits(Ty));
14839 for (
Use &U :
I->operands()) {
14840 if (
auto *J = dyn_cast<Instruction>(U.get()))
14841 if (Visited.
insert(J).second &&
14842 (isa<PHINode>(
I) || J->getParent() == Parent)) {
14846 if (!FirstNonBool && U.get()->getType() != Builder.
getInt1Ty())
14847 FirstNonBool = U.get();
14858 if (V->getType() == Builder.
getInt1Ty() && FirstNonBool)
14860 Width =
DL->getTypeSizeInBits(V->getType());
14864 InstrElementSize[
I] = Width;
14869bool BoUpSLP::collectValuesToDemote(
14870 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
14872 unsigned &MaxDepthLevel,
bool &IsProfitableToDemote,
14873 bool IsTruncRoot)
const {
14875 if (
all_of(E.Scalars, IsaPred<Constant>))
14878 unsigned OrigBitWidth =
DL->getTypeSizeInBits(E.Scalars.front()->getType());
14887 auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
14896 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14900 if (
auto *
I = dyn_cast<Instruction>(V)) {
14902 unsigned BitWidth2 =
14903 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14904 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14910 BitWidth1 = std::min(BitWidth1, BitWidth2);
14915 using namespace std::placeholders;
14916 auto FinalAnalysis = [&]() {
14917 if (!IsProfitableToDemote)
14920 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
14922 if (Res && E.State == TreeEntry::NeedToGather) {
14926 for (
Value *V : E.Scalars) {
14927 auto *EE = dyn_cast<ExtractElementInst>(V);
14930 UniqueBases.
insert(EE->getVectorOperand());
14932 const unsigned VF = E.Scalars.size();
14933 Type *OrigScalarTy = E.Scalars.front()->getType();
14934 if (UniqueBases.
size() <= 2 ||
14942 if (E.State == TreeEntry::NeedToGather || !Visited.
insert(&E).second ||
14944 return all_of(V->users(), [&](User *U) {
14945 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14948 return FinalAnalysis();
14951 return !all_of(V->users(), [=](User *U) {
14952 return getTreeEntry(U) ||
14953 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14954 (!isa<CmpInst>(U) && U->getType()->isSized() &&
14955 !U->getType()->isScalableTy() &&
14956 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14957 }) && !IsPotentiallyTruncated(V,
BitWidth);
14962 bool &NeedToExit) {
14963 NeedToExit =
false;
14964 unsigned InitLevel = MaxDepthLevel;
14966 unsigned Level = InitLevel;
14967 if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
14968 ToDemote, Visited, Level, IsProfitableToDemote,
14970 if (!IsProfitableToDemote)
14973 if (!FinalAnalysis())
14977 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14981 auto AttemptCheckBitwidth =
14984 NeedToExit =
false;
14985 unsigned BestFailBitwidth = 0;
14987 if (Checker(
BitWidth, OrigBitWidth))
14989 if (BestFailBitwidth == 0 && FinalAnalysis())
14993 if (BestFailBitwidth == 0) {
15004 auto TryProcessInstruction =
15011 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15016 if (E.UserTreeIndices.size() > 1 &&
15017 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15020 bool NeedToExit =
false;
15021 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15025 if (!ProcessOperands(
Operands, NeedToExit))
15034 return IsProfitableToDemote;
15036 switch (E.getOpcode()) {
15040 case Instruction::Trunc:
15041 if (IsProfitableToDemoteRoot)
15042 IsProfitableToDemote =
true;
15043 return TryProcessInstruction(
BitWidth);
15044 case Instruction::ZExt:
15045 case Instruction::SExt:
15046 IsProfitableToDemote =
true;
15047 return TryProcessInstruction(
BitWidth);
15051 case Instruction::Add:
15052 case Instruction::Sub:
15053 case Instruction::Mul:
15054 case Instruction::And:
15055 case Instruction::Or:
15056 case Instruction::Xor: {
15057 return TryProcessInstruction(
15058 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15060 case Instruction::Shl: {
15065 auto *I = cast<Instruction>(V);
15066 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15067 return AmtKnownBits.getMaxValue().ult(BitWidth);
15070 return TryProcessInstruction(
15071 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15073 case Instruction::LShr: {
15077 auto LShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15079 auto *I = cast<Instruction>(V);
15080 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15081 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15082 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15083 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15084 SimplifyQuery(*DL));
15087 return TryProcessInstruction(
15088 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15091 case Instruction::AShr: {
15095 auto AShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15097 auto *I = cast<Instruction>(V);
15098 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15099 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15100 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15101 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15105 return TryProcessInstruction(
15106 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15109 case Instruction::UDiv:
15110 case Instruction::URem: {
15112 auto Checker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15115 auto *I = cast<Instruction>(V);
15116 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15117 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15118 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15121 return TryProcessInstruction(
15122 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15126 case Instruction::Select: {
15127 return TryProcessInstruction(
15128 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15133 case Instruction::PHI: {
15134 const unsigned NumOps = E.getNumOperands();
15137 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
15139 return TryProcessInstruction(
BitWidth, Ops);
15142 case Instruction::Call: {
15143 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15147 if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
15148 ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
15152 auto CompChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15155 auto *I = cast<Instruction>(V);
15156 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15157 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15158 return MaskedValueIsZero(I->getOperand(0), Mask,
15159 SimplifyQuery(*DL)) &&
15160 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15162 assert((
ID == Intrinsic::smin ||
ID == Intrinsic::smax) &&
15163 "Expected min/max intrinsics only.");
15164 unsigned SignBits = OrigBitWidth -
BitWidth;
15177 if (
ID != Intrinsic::abs) {
15178 Operands.push_back(getOperandEntry(&E, 1));
15179 CallChecker = CompChecker;
15182 std::numeric_limits<InstructionCost::CostType>::max();
15184 unsigned VF = E.Scalars.size();
15194 if (
Cost < BestCost) {
15200 [[maybe_unused]]
bool NeedToExit;
15201 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15211 return FinalAnalysis();
15218 bool IsStoreOrInsertElt =
15219 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15220 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15221 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15222 ExtraBitWidthNodes.
size() <= 1 &&
15223 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15224 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15227 unsigned NodeIdx = 0;
15228 if (IsStoreOrInsertElt &&
15229 VectorizableTree.front()->State != TreeEntry::NeedToGather)
15233 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
15234 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.
empty()) ||
15235 (NodeIdx != 0 &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15238 static_cast<int>(NodeIdx);
15244 bool IsTruncRoot =
false;
15245 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15247 if (NodeIdx != 0 &&
15248 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15249 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
15250 assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
15251 IsTruncRoot =
true;
15253 IsProfitableToDemoteRoot =
true;
15258 if (AnalyzedMinBWVals.
contains(VectorizableTree[NodeIdx]->Scalars.front()))
15262 auto ComputeMaxBitWidth = [&](
const TreeEntry &E,
bool IsTopRoot,
15263 bool IsProfitableToDemoteRoot,
unsigned Opcode,
15264 unsigned Limit,
bool IsTruncRoot,
15265 bool IsSignedCmp) {
15267 unsigned VF = E.getVectorFactor();
15268 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15269 if (!TreeRootIT || !Opcode)
15273 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
15276 unsigned NumParts =
15282 unsigned MaxBitWidth = 1u;
15290 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
15291 KnownBits Known = computeKnownBits(R, *DL);
15292 return Known.isNonNegative();
15297 for (
Value *Root : E.Scalars) {
15300 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15316 if (!IsKnownPositive)
15320 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15322 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15325 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15330 if (NumParts > 1 &&
15336 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15337 Opcode == Instruction::SExt ||
15338 Opcode == Instruction::ZExt || NumParts > 1;
15343 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15344 bool NeedToDemote = IsProfitableToDemote;
15346 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15347 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15349 (MaxDepthLevel <= Limit &&
15350 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15351 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15352 DL->getTypeSizeInBits(TreeRootIT) /
15353 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15359 MaxBitWidth =
bit_ceil(MaxBitWidth);
15361 return MaxBitWidth;
15368 if (UserIgnoreList &&
15369 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15370 for (
Value *V : *UserIgnoreList) {
15372 auto NumTypeBits =
DL->getTypeSizeInBits(V->getType());
15373 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15376 unsigned BitWidth2 = BitWidth1;
15379 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15381 ReductionBitWidth =
15382 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15384 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15385 ReductionBitWidth = 8;
15387 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
15389 bool IsTopRoot = NodeIdx == 0;
15390 while (NodeIdx < VectorizableTree.size() &&
15391 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15392 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15395 IsTruncRoot =
true;
15397 bool IsSignedCmp =
false;
15398 while (NodeIdx < VectorizableTree.size()) {
15400 unsigned Limit = 2;
15401 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15403 ReductionBitWidth ==
15404 DL->getTypeSizeInBits(
15405 VectorizableTree.front()->Scalars.front()->getType()))
15407 unsigned MaxBitWidth = ComputeMaxBitWidth(
15408 *VectorizableTree[NodeIdx].
get(), IsTopRoot, IsProfitableToDemoteRoot,
15409 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15410 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.
empty())) {
15411 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15412 ReductionBitWidth =
bit_ceil(MaxBitWidth);
15413 else if (MaxBitWidth == 0)
15414 ReductionBitWidth = 0;
15417 for (
unsigned Idx : RootDemotes) {
15419 uint32_t OrigBitWidth =
DL->getTypeSizeInBits(V->getType());
15420 if (OrigBitWidth > MaxBitWidth) {
15428 RootDemotes.clear();
15430 IsProfitableToDemoteRoot =
true;
15432 if (ExtraBitWidthNodes.
empty()) {
15433 NodeIdx = VectorizableTree.size();
15435 unsigned NewIdx = 0;
15437 NewIdx = *ExtraBitWidthNodes.
begin();
15438 ExtraBitWidthNodes.
erase(ExtraBitWidthNodes.
begin());
15439 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.
empty());
15442 NodeIdx < VectorizableTree.size() &&
15443 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15446 EI.
UserTE->getOpcode() == Instruction::Trunc &&
15447 !EI.
UserTE->isAltShuffle();
15450 NodeIdx < VectorizableTree.size() &&
15451 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15453 return EI.
UserTE->getOpcode() == Instruction::ICmp &&
15455 auto *IC = dyn_cast<ICmpInst>(V);
15458 !isKnownNonNegative(IC->getOperand(0),
15459 SimplifyQuery(*DL)) ||
15460 !isKnownNonNegative(IC->getOperand(1),
15461 SimplifyQuery(*DL)));
15468 if (MaxBitWidth == 0 ||
15470 cast<IntegerType>(TreeRoot.
front()->getType())->getBitWidth()) {
15471 if (UserIgnoreList)
15478 for (
unsigned Idx : ToDemote) {
15479 TreeEntry *TE = VectorizableTree[
Idx].get();
15482 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15484 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15502 bool Changed =
runImpl(
F, SE,
TTI, TLI, AA, LI, DT, AC, DB, ORE);
15527 DL = &
F.getParent()->getDataLayout();
15531 bool Changed =
false;
15537 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
15542 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
15545 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
15549 BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
15558 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
15560 R.clearReductionData();
15561 collectSeedInstructions(BB);
15564 if (!Stores.empty()) {
15566 <<
" underlying objects.\n");
15567 Changed |= vectorizeStoreChains(R);
15571 Changed |= vectorizeChainsInBlock(BB, R);
15576 if (!GEPs.
empty()) {
15578 <<
" underlying objects.\n");
15579 Changed |= vectorizeGEPIndices(BB, R);
15584 R.optimizeGatherSequence();
15592 unsigned Idx,
unsigned MinVF,
15597 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15598 unsigned VF = Chain.
size();
15612 for (
Value *V : Chain)
15613 ValOps.
insert(cast<StoreInst>(V)->getValueOperand());
15616 if (
all_of(ValOps, IsaPred<Instruction>) && ValOps.
size() > 1) {
15621 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15622 (!S.MainOp->isSafeToRemove() ||
15625 return !isa<ExtractElementInst>(V) &&
15626 (V->getNumUses() > Chain.size() ||
15627 any_of(V->users(), [&](User *U) {
15628 return !Stores.contains(U);
15631 (ValOps.
size() > Chain.size() / 2 && !S.getOpcode())) {
15632 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15636 if (
R.isLoadCombineCandidate(Chain))
15638 R.buildTree(Chain);
15640 if (
R.isTreeTinyAndNotFullyVectorizable()) {
15641 if (
R.isGathered(Chain.front()) ||
15642 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15643 return std::nullopt;
15644 Size =
R.getTreeSize();
15647 R.reorderTopToBottom();
15648 R.reorderBottomToTop();
15649 R.buildExternalUses();
15651 R.computeMinimumValueSizes();
15652 R.transformNodes();
15654 Size =
R.getTreeSize();
15655 if (S.getOpcode() == Instruction::Load)
15663 using namespace ore;
15666 cast<StoreInst>(Chain[0]))
15667 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
15668 <<
" and with tree size "
15669 <<
NV(
"TreeSize",
R.getTreeSize()));
15683 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
15684 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
15685 unsigned Size = First ? Val.first : Val.second;
15697 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
15698 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
15699 unsigned P = First ? Val.first : Val.second;
15702 return V + (P - Mean) * (P - Mean);
15705 return Dev * 81 / (Mean * Mean) == 0;
15708bool SLPVectorizerPass::vectorizeStores(
15710 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
15715 bool Changed =
false;
15717 struct StoreDistCompare {
15718 bool operator()(
const std::pair<unsigned, int> &Op1,
15719 const std::pair<unsigned, int> &Op2)
const {
15720 return Op1.second < Op2.second;
15725 using StoreIndexToDistSet =
15726 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15727 auto TryToVectorize = [&](
const StoreIndexToDistSet &Set) {
15732 if (
Operands.empty() ||
Data.second - PrevDist == 1) {
15734 PrevDist =
Data.second;
15735 if (
Idx != Set.size() - 1)
15740 Operands.push_back(Stores[DataVar.first]);
15741 PrevDist = DataVar.second;
15746 .
insert({Operands.front(),
15747 cast<StoreInst>(Operands.front())->getValueOperand(),
15749 cast<StoreInst>(Operands.back())->getValueOperand(),
15754 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
15755 unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
15759 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15760 unsigned MaxRegVF = MaxVF;
15762 Type *StoreTy =
Store->getValueOperand()->getType();
15763 Type *ValueTy = StoreTy;
15764 if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
15765 ValueTy = Trunc->getSrcTy();
15766 if (ValueTy == StoreTy &&
15767 R.getVectorElementSize(
Store->getValueOperand()) <= EltSize)
15769 unsigned MinVF = std::max<unsigned>(
15771 R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15774 if (MaxVF < MinVF) {
15775 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
15777 <<
"MinVF (" << MinVF <<
")\n");
15781 unsigned NonPowerOf2VF = 0;
15786 unsigned CandVF =
Operands.size();
15788 NonPowerOf2VF = CandVF;
15793 unsigned Size = MinVF;
15795 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
15799 unsigned Repeat = 0;
15800 constexpr unsigned MaxAttempts = 4;
15802 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &
P) {
15803 P.first =
P.second = 1;
15806 auto IsNotVectorized = [](
bool First,
15807 const std::pair<unsigned, unsigned> &
P) {
15808 return First ?
P.first > 0 :
P.second > 0;
15810 auto IsVectorized = [](
bool First,
15811 const std::pair<unsigned, unsigned> &
P) {
15812 return First ?
P.first == 0 :
P.second == 0;
15814 auto VFIsProfitable = [](
bool First,
unsigned Size,
15815 const std::pair<unsigned, unsigned> &
P) {
15818 auto FirstSizeSame = [](
unsigned Size,
15819 const std::pair<unsigned, unsigned> &
P) {
15820 return Size ==
P.first;
15824 bool RepeatChanged =
false;
15825 bool AnyProfitableGraph;
15826 for (
unsigned Size : CandidateVFs) {
15827 AnyProfitableGraph =
false;
15828 unsigned StartIdx = std::distance(
15829 RangeSizes.begin(),
15830 find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
15831 std::placeholders::_1)));
15832 while (StartIdx <
End) {
15834 std::distance(RangeSizes.begin(),
15835 find_if(RangeSizes.drop_front(StartIdx),
15836 std::bind(IsVectorized,
Size >= MaxRegVF,
15837 std::placeholders::_1)));
15838 unsigned Sz = EndIdx >=
End ?
End : EndIdx;
15839 for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
15841 Size >= MaxRegVF)) {
15848 return cast<StoreInst>(V)
15849 ->getValueOperand()
15851 cast<StoreInst>(Slice.
front())
15852 ->getValueOperand()
15855 "Expected all operands of same type.");
15856 if (!NonSchedulable.empty()) {
15857 auto [NonSchedSizeMax, NonSchedSizeMin] =
15858 NonSchedulable.lookup(Slice.
front());
15859 if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
15860 Cnt += NonSchedSizeMax;
15865 std::optional<bool> Res =
15866 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
15870 .first->getSecond()
15878 AnyProfitableGraph = RepeatChanged = Changed =
true;
15882 [](std::pair<unsigned, unsigned> &
P) {
15883 P.first = P.second = 0;
15885 if (Cnt < StartIdx + MinVF) {
15886 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
15887 [](std::pair<unsigned, unsigned> &
P) {
15888 P.first = P.second = 0;
15890 StartIdx = Cnt +
Size;
15892 if (Cnt > Sz -
Size - MinVF) {
15894 [](std::pair<unsigned, unsigned> &
P) {
15895 P.first = P.second = 0;
15904 if (
Size > 2 && Res &&
15906 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
15907 std::placeholders::_1))) {
15913 if (
Size > MaxRegVF && TreeSize > 1 &&
15915 std::bind(FirstSizeSame, TreeSize,
15916 std::placeholders::_1))) {
15918 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
15924 [&](std::pair<unsigned, unsigned> &
P) {
15925 if (Size >= MaxRegVF)
15926 P.second = std::max(P.second, TreeSize);
15928 P.first = std::max(P.first, TreeSize);
15931 AnyProfitableGraph =
true;
15933 if (StartIdx >=
End)
15935 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
15936 AnyProfitableGraph =
true;
15937 StartIdx = std::distance(
15938 RangeSizes.begin(),
15939 find_if(RangeSizes.drop_front(Sz),
15940 std::bind(IsNotVectorized,
Size >= MaxRegVF,
15941 std::placeholders::_1)));
15943 if (!AnyProfitableGraph &&
Size >= MaxRegVF)
15947 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
15948 return P.first == 0 &&
P.second == 0;
15952 if (Repeat >= MaxAttempts ||
15953 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
15955 constexpr unsigned StoresLimit = 64;
15956 const unsigned MaxTotalNum =
bit_floor(std::min<unsigned>(
15958 static_cast<unsigned>(
15961 RangeSizes.begin(),
15962 find_if(RangeSizes, std::bind(IsNotVectorized,
true,
15963 std::placeholders::_1))) +
15966 if (VF > MaxTotalNum || VF >= StoresLimit)
15968 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
15970 P.first = std::max(
P.second,
P.first);
15974 CandidateVFs.clear();
15975 CandidateVFs.push_back(VF);
16022 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16024 Stores[Set.first]->getValueOperand()->getType(),
16025 Stores[Set.first]->getPointerOperand(),
16026 SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
16030 auto It = Set.second.find(std::make_pair(
Idx, *Diff));
16031 if (It == Set.second.end()) {
16032 Set.second.emplace(
Idx, *Diff);
16036 TryToVectorize(Set.second);
16037 StoreIndexToDistSet PrevSet;
16038 PrevSet.swap(Set.second);
16040 Set.second.emplace(
Idx, 0);
16043 unsigned StartIdx = It->first + 1;
16048 for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
16050 if (Pair.first <= It->first ||
16051 VectorizedStores.
contains(Stores[Pair.first]))
16053 unsigned BI = Pair.first - StartIdx;
16054 UsedStores.set(BI);
16055 Dists[BI] = Pair.second - It->second;
16057 for (
unsigned I = StartIdx;
I <
Idx; ++
I) {
16058 unsigned BI =
I - StartIdx;
16059 if (UsedStores.test(BI))
16060 Set.second.emplace(
I, Dists[BI]);
16064 auto &Res = SortedStores.emplace_back();
16066 Res.second.emplace(
Idx, 0);
16072 SI->getValueOperand()->getType()) {
16073 for (
auto &Set : SortedStores)
16074 TryToVectorize(Set.second);
16075 SortedStores.clear();
16078 FillStoresSet(
I, SI);
16082 for (
auto &Set : SortedStores)
16083 TryToVectorize(Set.second);
16088void SLPVectorizerPass::collectSeedInstructions(
BasicBlock *BB) {
16099 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
16100 if (!
SI->isSimple())
16110 else if (
auto *
GEP = dyn_cast<GetElementPtrInst>(&
I)) {
16111 if (
GEP->getNumIndices() != 1)
16114 if (isa<Constant>(
Idx))
16118 if (
GEP->getType()->isVectorTy())
16130 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
16131 << VL.
size() <<
".\n");
16136 if (!S.getOpcode())
16142 for (
Value *V : VL) {
16143 Type *Ty =
V->getType();
16147 R.getORE()->emit([&]() {
16148 std::string TypeStr;
16152 <<
"Cannot SLP vectorize list: type "
16153 << rso.str() +
" is unsupported by vectorizer";
16159 unsigned Sz =
R.getVectorElementSize(I0);
16160 unsigned MinVF =
R.getMinVF(Sz);
16161 unsigned MaxVF = std::max<unsigned>(
llvm::bit_floor(VL.size()), MinVF);
16162 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16164 R.getORE()->emit([&]() {
16166 <<
"Cannot SLP vectorize list: vectorization factor "
16167 <<
"less than 2 is not supported";
16172 bool Changed =
false;
16173 bool CandidateFound =
false;
16175 Type *ScalarTy = VL[0]->getType();
16176 if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16177 ScalarTy =
IE->getOperand(1)->getType();
16179 unsigned NextInst = 0, MaxInst = VL.size();
16180 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16187 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
16188 unsigned ActualVF = std::min(MaxInst -
I, VF);
16193 if (MaxVFOnly && ActualVF < MaxVF)
16195 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16201 auto *
I = dyn_cast<Instruction>(V);
16202 return I &&
R.isDeleted(
I);
16206 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
16210 if (
R.isTreeTinyAndNotFullyVectorizable())
16212 R.reorderTopToBottom();
16213 R.reorderBottomToTop(
16214 !isa<InsertElementInst>(Ops.
front()) &&
16215 !
R.doesRootHaveInTreeUses());
16216 R.buildExternalUses();
16218 R.computeMinimumValueSizes();
16219 R.transformNodes();
16221 CandidateFound =
true;
16222 MinCost = std::min(MinCost,
Cost);
16225 <<
" for VF=" << ActualVF <<
"\n");
16229 cast<Instruction>(Ops[0]))
16230 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
16231 <<
" and with tree size "
16232 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
16243 if (!Changed && CandidateFound) {
16244 R.getORE()->emit([&]() {
16246 <<
"List vectorization was possible but not beneficial with cost "
16247 <<
ore::NV(
"Cost", MinCost) <<
" >= "
16250 }
else if (!Changed) {
16251 R.getORE()->emit([&]() {
16253 <<
"Cannot SLP vectorize list: vectorization was impossible"
16254 <<
" with available vectorization factors";
16264 if (!isa<BinaryOperator, CmpInst>(
I) || isa<VectorType>(
I->getType()))
16270 auto *Op0 = dyn_cast<Instruction>(
I->getOperand(0));
16271 auto *Op1 = dyn_cast<Instruction>(
I->getOperand(1));
16272 if (!Op0 || !Op1 || Op0->getParent() !=
P || Op1->getParent() !=
P)
16279 auto *
A = dyn_cast<BinaryOperator>(Op0);
16280 auto *
B = dyn_cast<BinaryOperator>(Op1);
16282 if (
A &&
B &&
B->hasOneUse()) {
16283 auto *B0 = dyn_cast<BinaryOperator>(
B->getOperand(0));
16284 auto *B1 = dyn_cast<BinaryOperator>(
B->getOperand(1));
16285 if (B0 && B0->getParent() ==
P)
16287 if (B1 && B1->getParent() ==
P)
16291 if (
B &&
A &&
A->hasOneUse()) {
16292 auto *A0 = dyn_cast<BinaryOperator>(
A->getOperand(0));
16293 auto *A1 = dyn_cast<BinaryOperator>(
A->getOperand(1));
16294 if (A0 && A0->getParent() ==
P)
16296 if (A1 && A1->getParent() ==
P)
16300 if (Candidates.
size() == 1)
16301 return tryToVectorizeList({Op0, Op1},
R);
16304 std::optional<int> BestCandidate =
R.findBestRootPair(Candidates);
16305 if (!BestCandidate)
16307 return tryToVectorizeList(
16308 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
R);
16342 ReductionOpsListType ReductionOps;
16354 bool IsSupportedHorRdxIdentityOp =
false;
16365 return isa<SelectInst>(
I) &&
16371 if (Kind == RecurKind::None)
16379 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16383 return I->getFastMathFlags().noNaNs();
16386 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16389 return I->isAssociative();
16398 return I->getOperand(2);
16399 return I->getOperand(
Index);
16407 case RecurKind::Or:
16413 case RecurKind::And:
16419 case RecurKind::Add:
16420 case RecurKind::Mul:
16421 case RecurKind::Xor:
16422 case RecurKind::FAdd:
16423 case RecurKind::FMul:
16426 case RecurKind::FMax:
16428 case RecurKind::FMin:
16430 case RecurKind::FMaximum:
16432 case RecurKind::FMinimum:
16434 case RecurKind::SMax:
16440 case RecurKind::SMin:
16446 case RecurKind::UMax:
16452 case RecurKind::UMin:
16467 const ReductionOpsListType &ReductionOps) {
16468 bool UseSelect = ReductionOps.size() == 2 ||
16470 (ReductionOps.size() == 1 &&
16471 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16472 assert((!UseSelect || ReductionOps.size() != 2 ||
16473 isa<SelectInst>(ReductionOps[1][0])) &&
16474 "Expected cmp + select pairs for reduction");
16477 if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
16491 auto *
I = dyn_cast<Instruction>(V);
16493 return RecurKind::None;
16495 return RecurKind::Add;
16497 return RecurKind::Mul;
16500 return RecurKind::And;
16503 return RecurKind::Or;
16505 return RecurKind::Xor;
16507 return RecurKind::FAdd;
16509 return RecurKind::FMul;
16512 return RecurKind::FMax;
16514 return RecurKind::FMin;
16517 return RecurKind::FMaximum;
16519 return RecurKind::FMinimum;
16525 return RecurKind::SMax;
16527 return RecurKind::SMin;
16529 return RecurKind::UMax;
16531 return RecurKind::UMin;
16533 if (
auto *
Select = dyn_cast<SelectInst>(
I)) {
16555 if (!isa<ExtractElementInst>(
RHS) ||
16557 return RecurKind::None;
16559 if (!isa<ExtractElementInst>(
LHS) ||
16561 return RecurKind::None;
16563 if (!isa<ExtractElementInst>(
LHS) || !isa<ExtractElementInst>(
RHS))
16564 return RecurKind::None;
16568 return RecurKind::None;
16573 return RecurKind::None;
16576 return RecurKind::SMax;
16579 return RecurKind::SMin;
16582 return RecurKind::UMax;
16585 return RecurKind::UMin;
16588 return RecurKind::None;
16592 static unsigned getFirstOperandIndex(
Instruction *
I) {
16593 return isCmpSelMinMax(
I) ? 1 : 0;
16599 return isCmpSelMinMax(
I) ? 3 : 2;
16605 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
16606 auto *Sel = cast<SelectInst>(
I);
16607 auto *
Cmp = dyn_cast<Instruction>(Sel->getCondition());
16608 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
16610 return I->getParent() == BB;
16614 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax,
Instruction *
I) {
16615 if (IsCmpSelMinMax) {
16618 if (
auto *Sel = dyn_cast<SelectInst>(
I))
16619 return Sel->
hasNUses(2) && Sel->getCondition()->hasOneUse();
16620 return I->hasNUses(2);
16624 return I->hasOneUse();
16629 if (isCmpSelMinMax(
I))
16630 ReductionOps.assign(2, ReductionOpsType());
16632 ReductionOps.assign(1, ReductionOpsType());
16637 if (isCmpSelMinMax(
I)) {
16638 ReductionOps[0].emplace_back(cast<SelectInst>(
I)->getCondition());
16639 ReductionOps[1].emplace_back(
I);
16641 ReductionOps[0].emplace_back(
I);
16646 int Sz = Data.size();
16647 auto *
I = dyn_cast<Instruction>(Data.front());
16648 return Sz > 1 ||
isConstant(Data.front()) ||
16659 RdxKind = HorizontalReduction::getRdxKind(Root);
16660 if (!isVectorizable(RdxKind, Root))
16671 if (
auto *Sel = dyn_cast<SelectInst>(Root))
16672 if (!Sel->getCondition()->hasOneUse())
16675 ReductionRoot = Root;
16680 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16689 for (
int I = getFirstOperandIndex(TreeN),
16690 End = getNumberOfOperands(TreeN);
16692 Value *EdgeVal = getRdxOperand(TreeN,
I);
16693 ReducedValsToOps[EdgeVal].push_back(TreeN);
16694 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16697 !hasSameParent(EdgeInst, BB)) {
16698 ExtraArgs.push_back(EdgeVal);
16705 if (!EdgeInst ||
getRdxKind(EdgeInst) != RdxKind ||
16706 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16707 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16708 !isVectorizable(RdxKind, EdgeInst) ||
16709 (
R.isAnalyzedReductionRoot(EdgeInst) &&
16710 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16711 PossibleReducedVals.push_back(EdgeVal);
16714 ReductionOps.push_back(EdgeInst);
16723 PossibleReducedVals;
16724 initReductionOps(Root);
16729 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
16732 auto LIt = LoadsMap.
find(
Ptr);
16733 if (LIt != LoadsMap.
end()) {
16734 for (
LoadInst *RLI : LIt->second) {
16740 for (
LoadInst *RLI : LIt->second) {
16744 DoNotReverseVals.
insert(RLI);
16748 if (LIt->second.size() > 2) {
16750 hash_value(LIt->second.back()->getPointerOperand());
16751 DoNotReverseVals.
insert(LIt->second.back());
16756 LoadKeyUsed.
insert(Key);
16761 while (!Worklist.empty()) {
16766 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16769 if (
Args.size() < 2) {
16770 addReductionOps(TreeN);
16772 if (!
Args.empty()) {
16773 assert(
Args.size() == 1 &&
"Expected only single argument.");
16774 ExtraArgs[TreeN] =
Args.front();
16778 for (
Value *V : PossibleRedVals) {
16782 ++PossibleReducedVals[
Key][
Idx]
16783 .
insert(std::make_pair(V, 0))
16786 Worklist.append(PossibleReductionOps.
rbegin(),
16787 PossibleReductionOps.
rend());
16792 ++PossibleReducedVals[
Key][
Idx]
16793 .
insert(std::make_pair(TreeN, 0))
16797 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
16800 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
16801 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
16803 for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
16806 auto RedValsVect = It->second.takeVector();
16808 for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
16809 PossibleRedValsVect.
back().append(Data.second, Data.first);
16811 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
16812 return P1.size() > P2.size();
16816 if (isGoodForReduction(Data) ||
16817 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16818 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16820 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16824 NewIdx = ReducedVals.
size();
16827 if (DoNotReverseVals.
contains(Data.front()))
16828 ReducedVals[NewIdx].
append(Data.begin(), Data.end());
16830 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
16832 ReducedVals.
emplace_back().append(Data.rbegin(), Data.rend());
16847 constexpr int ReductionLimit = 4;
16848 constexpr unsigned RegMaxNumber = 4;
16849 constexpr unsigned RedValsMaxNumber = 128;
16853 unsigned NumReducedVals =
16854 std::accumulate(ReducedVals.
begin(), ReducedVals.
end(), 0,
16856 if (!isGoodForReduction(Vals))
16858 return Num + Vals.size();
16860 if (NumReducedVals < ReductionLimit &&
16865 for (ReductionOpsType &RdxOps : ReductionOps)
16866 for (
Value *RdxOp : RdxOps)
16867 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16878 ReducedVals.
size() * ReducedVals.
front().size() + ExtraArgs.size());
16881 ExternallyUsedValues.
reserve(ExtraArgs.size() + 1);
16884 for (
const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16885 assert(Pair.first &&
"DebugLoc must be set.");
16886 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16887 TrackedVals.
try_emplace(Pair.second, Pair.second);
16892 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
16893 assert(isa<SelectInst>(RdxRootInst) &&
16894 "Expected min/max reduction to have select root instruction");
16895 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16896 assert(isa<Instruction>(ScalarCond) &&
16897 "Expected min/max reduction to have compare condition");
16898 return cast<Instruction>(ScalarCond);
16902 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
16903 if (VectorizedTree) {
16906 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16907 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16910 auto It = ReducedValsToOps.
find(Res);
16911 if (It != ReducedValsToOps.
end() &&
16917 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
16923 bool AnyBoolLogicOp =
16925 return isBoolLogicOp(cast<Instruction>(V));
16929 ExternallyUsedValues[ReductionRoot];
16931 ReductionOps.front().size());
16932 for (ReductionOpsType &RdxOps : ReductionOps)
16933 for (
Value *RdxOp : RdxOps) {
16936 IgnoreList.insert(RdxOp);
16941 for (
Value *U : IgnoreList)
16942 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
16943 RdxFMF &= FPMO->getFastMathFlags();
16944 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16949 for (
Value *V : Candidates)
16950 TrackedVals.try_emplace(V, V);
16956 Value *VectorizedTree =
nullptr;
16957 bool CheckForReusedReductionOps =
false;
16959 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
16965 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
16966 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16971 auto *Inst = dyn_cast<Instruction>(RdxVal);
16973 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16974 (S.getOpcode() && !Inst))
16977 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16979 bool ShuffledExtracts =
false;
16981 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16983 InstructionsState NextS =
getSameOpcode(ReducedVals[
I + 1], TLI);
16984 if (NextS.getOpcode() == Instruction::ExtractElement &&
16985 !NextS.isAltShuffle()) {
16987 for (
Value *RV : ReducedVals[
I + 1]) {
16988 Value *RdxVal = TrackedVals.find(RV)->second;
16992 if (
auto *Inst = dyn_cast<Instruction>(RdxVal))
16993 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
16995 CommonCandidates.push_back(RdxVal);
16996 TrackedToOrig.try_emplace(RdxVal, RV);
17001 Candidates.
swap(CommonCandidates);
17002 ShuffledExtracts =
true;
17011 ++VectorizedVals.try_emplace(Candidates.
front(), 0).first->getSecond();
17013 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
17014 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17015 if (
auto *ResI = dyn_cast<Instruction>(Res))
17016 V.analyzedReductionRoot(ResI);
17018 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17022 unsigned NumReducedVals = Candidates.
size();
17023 if (NumReducedVals < ReductionLimit &&
17030 IsSupportedHorRdxIdentityOp =
17032 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17035 if (IsSupportedHorRdxIdentityOp)
17036 for (
Value *V : Candidates)
17037 ++SameValuesCounter.
insert(std::make_pair(V, 0)).first->second;
17048 bool SameScaleFactor =
false;
17049 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17050 SameValuesCounter.
size() != Candidates.size();
17051 if (OptReusedScalars) {
17053 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17054 RdxKind == RecurKind::Xor) &&
17056 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
17057 return P.second == SameValuesCounter.
front().second;
17059 Candidates.resize(SameValuesCounter.
size());
17060 transform(SameValuesCounter, Candidates.begin(),
17061 [](
const auto &
P) { return P.first; });
17062 NumReducedVals = Candidates.size();
17064 if (NumReducedVals == 1) {
17065 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17066 unsigned Cnt = SameValuesCounter.
lookup(OrigV);
17068 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17069 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17070 VectorizedVals.try_emplace(OrigV, Cnt);
17075 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
17076 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
17080 unsigned ReduxWidth = std::min<unsigned>(
17082 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17083 RegMaxNumber * RedValsMaxNumber));
17084 unsigned Start = 0;
17085 unsigned Pos = Start;
17087 unsigned PrevReduxWidth = ReduxWidth;
17088 bool CheckForReusedReductionOpsLocal =
false;
17089 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17090 &CheckForReusedReductionOpsLocal,
17091 &PrevReduxWidth, &
V,
17092 &IgnoreList](
bool IgnoreVL =
false) {
17093 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
17094 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17097 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17100 if (Pos < NumReducedVals - ReduxWidth + 1)
17101 return IsAnyRedOpGathered;
17104 return IsAnyRedOpGathered;
17106 bool AnyVectorized =
false;
17107 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17108 ReduxWidth >= ReductionLimit) {
17111 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17113 CheckForReusedReductionOps =
true;
17116 PrevReduxWidth = ReduxWidth;
17119 if (
V.areAnalyzedReductionVals(VL)) {
17120 (void)AdjustReducedVals(
true);
17126 auto *RedValI = dyn_cast<Instruction>(RedVal);
17129 return V.isDeleted(RedValI);
17132 V.buildTree(VL, IgnoreList);
17133 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
17134 if (!AdjustReducedVals())
17135 V.analyzedReductionVals(VL);
17138 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
17139 if (!AdjustReducedVals())
17140 V.analyzedReductionVals(VL);
17143 V.reorderTopToBottom();
17145 V.reorderBottomToTop(
true);
17149 ExternallyUsedValues);
17150 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
17151 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
17153 for (
Value *V : ReducedVals[Cnt])
17154 if (isa<Instruction>(V))
17155 LocalExternallyUsedValues[TrackedVals[
V]];
17157 if (!IsSupportedHorRdxIdentityOp) {
17160 "Reused values counter map is not empty");
17161 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17162 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17164 Value *
V = Candidates[Cnt];
17165 Value *OrigV = TrackedToOrig.find(V)->second;
17166 ++SameValuesCounter[OrigV];
17172 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17173 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17175 Value *RdxVal = Candidates[Cnt];
17176 if (!Visited.
insert(RdxVal).second)
17180 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
17181 LocalExternallyUsedValues[RdxVal];
17184 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17186 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17187 if (NumOps != ReducedValsToOps.
find(OrigV)->second.size())
17188 LocalExternallyUsedValues[RdxVal];
17191 if (!IsSupportedHorRdxIdentityOp)
17192 SameValuesCounter.
clear();
17193 for (
Value *RdxVal : VL)
17194 if (RequiredExtract.
contains(RdxVal))
17195 LocalExternallyUsedValues[RdxVal];
17199 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17200 ReplacementToExternal.
try_emplace(Pair.second, Pair.first);
17201 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17203 auto RIt = ReplacementToExternal.
find(Ext);
17204 while (RIt != ReplacementToExternal.
end()) {
17206 RIt = ReplacementToExternal.
find(Ext);
17208 auto *It = ExternallyUsedValues.
find(Ext);
17209 if (It == ExternallyUsedValues.
end())
17211 LocalExternallyUsedValues[Pair.second].append(It->second);
17213 V.buildExternalUses(LocalExternallyUsedValues);
17215 V.computeMinimumValueSizes();
17216 V.transformNodes();
17221 getReductionCost(
TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17224 <<
" for reduction\n");
17228 V.getORE()->emit([&]() {
17230 SV_NAME,
"HorSLPNotBeneficial",
17231 ReducedValsToOps.
find(VL[0])->second.front())
17232 <<
"Vectorizing horizontal reduction is possible "
17233 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
17234 <<
" and threshold "
17237 if (!AdjustReducedVals())
17238 V.analyzedReductionVals(VL);
17242 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
17243 <<
Cost <<
". (HorRdx)\n");
17244 V.getORE()->emit([&]() {
17246 SV_NAME,
"VectorizedHorizontalReduction",
17247 ReducedValsToOps.
find(VL[0])->second.front())
17248 <<
"Vectorized horizontal reduction with cost "
17249 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
17250 <<
ore::NV(
"TreeSize",
V.getTreeSize());
17257 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17259 if (IsCmpSelMinMax)
17260 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17263 Value *VectorizedRoot =
V.vectorizeTree(LocalExternallyUsedValues,
17264 ReplacedExternals, InsertPt);
17271 if ((isBoolLogicOp(RdxRootInst) ||
17272 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17274 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
17277 if (OptReusedScalars && !SameScaleFactor) {
17279 emitReusedOps(VectorizedRoot, Builder,
V.getRootNodeScalars(),
17280 SameValuesCounter, TrackedToOrig);
17283 Value *ReducedSubTree =
17284 emitReduction(VectorizedRoot, Builder, ReduxWidth,
TTI);
17285 if (ReducedSubTree->
getType() != VL.front()->getType()) {
17287 ReducedSubTree, VL.front()->getType(),
any_of(VL, [&](
Value *R) {
17289 R, cast<Instruction>(ReductionOps.front().front())
17291 ->getDataLayout());
17299 if (OptReusedScalars && SameScaleFactor)
17300 ReducedSubTree = emitScaleForReusedOps(
17301 ReducedSubTree, Builder, SameValuesCounter.
front().second);
17303 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17305 for (
Value *RdxVal : VL) {
17306 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17307 if (IsSupportedHorRdxIdentityOp) {
17308 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17311 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17312 if (!
V.isVectorized(RdxVal))
17313 RequiredExtract.
insert(RdxVal);
17318 AnyVectorized =
true;
17320 if (OptReusedScalars && !AnyVectorized) {
17321 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
17322 Value *RedVal = emitScaleForReusedOps(
P.first, Builder,
P.second);
17323 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17324 Value *OrigV = TrackedToOrig.find(
P.first)->second;
17325 VectorizedVals.try_emplace(OrigV,
P.second);
17330 if (VectorizedTree) {
17351 if (!AnyBoolLogicOp)
17353 if (isBoolLogicOp(RedOp1) &&
17354 ((!InitStep &&
LHS == VectorizedTree) ||
17357 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
17358 getRdxOperand(RedOp2, 0) ==
RHS ||
17363 if (
LHS != VectorizedTree)
17374 unsigned Sz = InstVals.
size();
17377 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
17380 Value *RdxVal1 = InstVals[
I].second;
17381 Value *StableRdxVal1 = RdxVal1;
17382 auto It1 = TrackedVals.find(RdxVal1);
17383 if (It1 != TrackedVals.end())
17384 StableRdxVal1 = It1->second;
17385 Value *RdxVal2 = InstVals[
I + 1].second;
17386 Value *StableRdxVal2 = RdxVal2;
17387 auto It2 = TrackedVals.find(RdxVal2);
17388 if (It2 != TrackedVals.end())
17389 StableRdxVal2 = It2->second;
17393 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
17395 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17396 StableRdxVal2,
"op.rdx", ReductionOps);
17397 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
17400 ExtraReds[Sz / 2] = InstVals.
back();
17404 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
17408 for (
Value *RdxVal : Candidates) {
17409 if (!Visited.
insert(RdxVal).second)
17411 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17418 for (
auto &Pair : ExternallyUsedValues) {
17420 for (
auto *
I : Pair.second)
17424 bool InitStep =
true;
17425 while (ExtraReductions.
size() > 1) {
17426 VectorizedTree = ExtraReductions.
front().second;
17428 FinalGen(ExtraReductions, InitStep);
17429 ExtraReductions.
swap(NewReds);
17432 VectorizedTree = ExtraReductions.
front().second;
17434 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17443 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
17450 for (
auto *U :
Ignore->users()) {
17452 "All users must be either in the reduction ops list.");
17455 if (!
Ignore->use_empty()) {
17457 Ignore->replaceAllUsesWith(Undef);
17459 V.eraseInstruction(cast<Instruction>(
Ignore));
17462 }
else if (!CheckForReusedReductionOps) {
17463 for (ReductionOpsType &RdxOps : ReductionOps)
17464 for (
Value *RdxOp : RdxOps)
17465 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17467 return VectorizedTree;
17474 bool IsCmpSelMinMax,
unsigned ReduxWidth,
17477 Type *ScalarTy = ReducedVals.
front()->getType();
17486 int Cnt = ReducedVals.
size();
17487 for (
Value *RdxVal : ReducedVals) {
17492 Cost += GenCostFn();
17497 auto *RdxOp = cast<Instruction>(U);
17498 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17506 Cost += ScalarCost;
17508 Cost += GenCostFn();
17513 case RecurKind::Add:
17514 case RecurKind::Mul:
17515 case RecurKind::Or:
17516 case RecurKind::And:
17517 case RecurKind::Xor:
17518 case RecurKind::FAdd:
17519 case RecurKind::FMul: {
17524 ScalarCost = EvaluateScalarCost([&]() {
17529 case RecurKind::FMax:
17530 case RecurKind::FMin:
17531 case RecurKind::FMaximum:
17532 case RecurKind::FMinimum:
17533 case RecurKind::SMax:
17534 case RecurKind::SMin:
17535 case RecurKind::UMax:
17536 case RecurKind::UMin: {
17540 ScalarCost = EvaluateScalarCost([&]() {
17550 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
17552 <<
" (It is a splitting reduction)\n");
17553 return VectorCost - ScalarCost;
17559 assert(VectorizedValue &&
"Need to have a vectorized tree node");
17561 "We only handle power-of-two reductions for now");
17562 assert(RdxKind != RecurKind::FMulAdd &&
17563 "A call to the llvm.fmuladd intrinsic is not handled yet");
17565 ++NumVectorInstructions;
17572 assert(IsSupportedHorRdxIdentityOp &&
17573 "The optimization of matched scalar identity horizontal reductions "
17574 "must be supported.");
17576 case RecurKind::Add: {
17578 Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
17580 << VectorizedValue <<
". (HorRdx)\n");
17581 return Builder.
CreateMul(VectorizedValue, Scale);
17583 case RecurKind::Xor: {
17585 LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
17586 <<
". (HorRdx)\n");
17589 return VectorizedValue;
17591 case RecurKind::FAdd: {
17593 Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
17595 << VectorizedValue <<
". (HorRdx)\n");
17596 return Builder.
CreateFMul(VectorizedValue, Scale);
17598 case RecurKind::And:
17599 case RecurKind::Or:
17600 case RecurKind::SMax:
17601 case RecurKind::SMin:
17602 case RecurKind::UMax:
17603 case RecurKind::UMin:
17604 case RecurKind::FMax:
17605 case RecurKind::FMin:
17606 case RecurKind::FMaximum:
17607 case RecurKind::FMinimum:
17609 return VectorizedValue;
17610 case RecurKind::Mul:
17611 case RecurKind::FMul:
17612 case RecurKind::FMulAdd:
17613 case RecurKind::IAnyOf:
17614 case RecurKind::FAnyOf:
17615 case RecurKind::None:
17627 assert(IsSupportedHorRdxIdentityOp &&
17628 "The optimization of matched scalar identity horizontal reductions "
17629 "must be supported.");
17630 auto *VTy = cast<FixedVectorType>(VectorizedValue->
getType());
17631 if (VTy->getElementType() != VL.
front()->getType()) {
17637 R, cast<Instruction>(ReductionOps.front().front())
17639 ->getDataLayout());
17644 case RecurKind::Add: {
17647 for (
Value *V : VL) {
17648 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17649 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
false));
17653 << VectorizedValue <<
". (HorRdx)\n");
17654 return Builder.
CreateMul(VectorizedValue, Scale);
17656 case RecurKind::And:
17657 case RecurKind::Or:
17660 <<
". (HorRdx)\n");
17661 return VectorizedValue;
17662 case RecurKind::SMax:
17663 case RecurKind::SMin:
17664 case RecurKind::UMax:
17665 case RecurKind::UMin:
17666 case RecurKind::FMax:
17667 case RecurKind::FMin:
17668 case RecurKind::FMaximum:
17669 case RecurKind::FMinimum:
17672 <<
". (HorRdx)\n");
17673 return VectorizedValue;
17674 case RecurKind::Xor: {
17680 cast<FixedVectorType>(VectorizedValue->
getType())->getNumElements(),
17682 std::iota(
Mask.begin(),
Mask.end(), 0);
17683 bool NeedShuffle =
false;
17684 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
17686 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17687 if (Cnt % 2 == 0) {
17689 NeedShuffle =
true;
17695 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
17699 ConstantVector::getNullValue(VectorizedValue->
getType()),
Mask);
17700 return VectorizedValue;
17702 case RecurKind::FAdd: {
17705 for (
Value *V : VL) {
17706 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17707 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
17710 return Builder.
CreateFMul(VectorizedValue, Scale);
17712 case RecurKind::Mul:
17713 case RecurKind::FMul:
17714 case RecurKind::FMulAdd:
17715 case RecurKind::IAnyOf:
17716 case RecurKind::FAnyOf:
17717 case RecurKind::None:
17727 return HorizontalReduction::getRdxKind(V);
17730 if (
auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17731 return cast<FixedVectorType>(IE->getType())->getNumElements();
17733 unsigned AggregateSize = 1;
17734 auto *
IV = cast<InsertValueInst>(InsertInst);
17735 Type *CurrentType =
IV->getType();
17737 if (
auto *ST = dyn_cast<StructType>(CurrentType)) {
17738 for (
auto *Elt : ST->elements())
17739 if (Elt != ST->getElementType(0))
17740 return std::nullopt;
17741 AggregateSize *= ST->getNumElements();
17742 CurrentType = ST->getElementType(0);
17743 }
else if (
auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17744 AggregateSize *= AT->getNumElements();
17745 CurrentType = AT->getElementType();
17746 }
else if (
auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17747 AggregateSize *= VT->getNumElements();
17748 return AggregateSize;
17750 return AggregateSize;
17752 return std::nullopt;
17761 unsigned OperandOffset) {
17764 std::optional<unsigned> OperandIndex =
17768 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17770 BuildVectorOpds, InsertElts, *OperandIndex);
17773 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17774 InsertElts[*OperandIndex] = LastInsertInst;
17776 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->
getOperand(0));
17777 }
while (LastInsertInst !=
nullptr &&
17778 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17801 assert((isa<InsertElementInst>(LastInsertInst) ||
17802 isa<InsertValueInst>(LastInsertInst)) &&
17803 "Expected insertelement or insertvalue instruction!");
17806 "Expected empty result vectors!");
17809 if (!AggregateSize)
17811 BuildVectorOpds.
resize(*AggregateSize);
17812 InsertElts.
resize(*AggregateSize);
17817 if (BuildVectorOpds.
size() >= 2)
17835 auto DominatedReduxValue = [&](
Value *R) {
17836 return isa<Instruction>(R) &&
17837 DT->
dominates(
P->getParent(), cast<Instruction>(R)->getParent());
17843 if (
P->getIncomingBlock(0) == ParentBB) {
17844 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
17845 }
else if (
P->getIncomingBlock(1) == ParentBB) {
17846 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
17849 if (Rdx && DominatedReduxValue(Rdx))
17862 if (
P->getIncomingBlock(0) == BBLatch) {
17863 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
17864 }
else if (
P->getIncomingBlock(1) == BBLatch) {
17865 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
17868 if (Rdx && DominatedReduxValue(Rdx))
17902 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17903 isa<IntrinsicInst>(Root)) &&
17904 "Expected binop, select, or intrinsic for reduction matching");
17906 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17908 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17910 return dyn_cast<Instruction>(
RHS);
17912 return dyn_cast<Instruction>(
LHS);
17919 Value *Op0 =
nullptr;
17920 Value *Op1 =
nullptr;
17923 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17929 Value *B0 =
nullptr, *B1 =
nullptr;
17934bool SLPVectorizerPass::vectorizeHorReduction(
17939 bool TryOperandsAsNewSeeds =
P && isa<BinaryOperator>(Root);
17941 if (Root->
getParent() != BB || isa<PHINode>(Root))
17945 auto SelectRoot = [&]() {
17964 std::queue<std::pair<Instruction *, unsigned>>
Stack;
17965 Stack.emplace(SelectRoot(), 0);
17969 if (
R.isAnalyzedReductionRoot(Inst))
17974 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *
DL, *TLI))
17976 return HorRdx.tryToReduce(R, *
DL,
TTI, *TLI);
17978 auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
17979 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17986 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
17991 while (!
Stack.empty()) {
17994 std::tie(Inst, Level) =
Stack.front();
17999 if (
R.isDeleted(Inst))
18001 if (
Value *VectorizedV = TryToReduce(Inst)) {
18003 if (
auto *
I = dyn_cast<Instruction>(VectorizedV)) {
18005 Stack.emplace(
I, Level);
18010 if (!TryAppendToPostponedInsts(Inst)) {
18021 if (VisitedInstrs.
insert(
Op).second)
18022 if (
auto *
I = dyn_cast<Instruction>(
Op))
18025 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
I) &&
18026 !
R.isDeleted(
I) &&
I->getParent() == BB)
18027 Stack.emplace(
I, Level);
18036 bool Res = vectorizeHorReduction(
P, Root, BB, R,
TTI, PostponedInsts);
18037 Res |= tryToVectorize(PostponedInsts, R);
18044 for (
Value *V : Insts)
18045 if (
auto *Inst = dyn_cast<Instruction>(V); Inst && !
R.isDeleted(Inst))
18046 Res |= tryToVectorize(Inst, R);
18050bool SLPVectorizerPass::vectorizeInsertValueInst(
InsertValueInst *IVI,
18052 if (!
R.canMapToVector(IVI->
getType()))
18060 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IVI <<
"\n");
18062 return tryToVectorizeList(BuildVectorOpds, R);
18071 (
llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18075 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IEI <<
"\n");
18076 return tryToVectorizeList(BuildVectorInsts, R);
18079template <
typename T>
18084 bool MaxVFOnly,
BoUpSLP &R) {
18085 bool Changed =
false;
18094 auto *SameTypeIt = IncIt;
18095 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
18099 unsigned NumElts = (SameTypeIt - IncIt);
18100 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize starting at nodes ("
18101 << NumElts <<
")\n");
18112 TryToVectorizeHelper(
ArrayRef(IncIt, NumElts), MaxVFOnly)) {
18118 auto GetMinNumElements = [&R](
Value *V) {
18119 unsigned EltSize = R.getVectorElementSize(V);
18120 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18122 if (NumElts < GetMinNumElements(*IncIt) &&
18123 (Candidates.
empty() ||
18124 Candidates.
front()->getType() == (*IncIt)->getType())) {
18125 Candidates.
append(IncIt, std::next(IncIt, NumElts));
18129 if (Candidates.
size() > 1 &&
18130 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18131 if (TryToVectorizeHelper(Candidates,
false)) {
18134 }
else if (MaxVFOnly) {
18136 for (
auto *It = Candidates.
begin(), *
End = Candidates.
end();
18138 auto *SameTypeIt = It;
18139 while (SameTypeIt !=
End && AreCompatible(*SameTypeIt, *It))
18141 unsigned NumElts = (SameTypeIt - It);
18142 if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(It, NumElts),
18148 Candidates.
clear();
18152 IncIt = SameTypeIt;
18164template <
bool IsCompatibility>
18169 "Expected valid element types only.");
18171 return IsCompatibility;
18172 auto *CI1 = cast<CmpInst>(V);
18173 auto *CI2 = cast<CmpInst>(V2);
18174 if (CI1->getOperand(0)->getType()->getTypeID() <
18176 return !IsCompatibility;
18177 if (CI1->getOperand(0)->getType()->getTypeID() >
18186 if (BasePred1 < BasePred2)
18187 return !IsCompatibility;
18188 if (BasePred1 > BasePred2)
18191 bool CI1Preds = Pred1 == BasePred1;
18192 bool CI2Preds = Pred2 == BasePred1;
18193 for (
int I = 0, E = CI1->getNumOperands();
I < E; ++
I) {
18194 auto *Op1 = CI1->getOperand(CI1Preds ?
I : E -
I - 1);
18195 auto *Op2 = CI2->
getOperand(CI2Preds ?
I : E -
I - 1);
18199 return !IsCompatibility;
18202 if (
auto *I1 = dyn_cast<Instruction>(Op1))
18203 if (
auto *I2 = dyn_cast<Instruction>(Op2)) {
18204 if (IsCompatibility) {
18205 if (I1->getParent() != I2->getParent())
18212 return NodeI2 !=
nullptr;
18215 assert((NodeI1 == NodeI2) ==
18217 "Different nodes should have different DFS numbers");
18218 if (NodeI1 != NodeI2)
18222 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18224 if (IsCompatibility)
18226 if (I1->getOpcode() != I2->getOpcode())
18227 return I1->getOpcode() < I2->getOpcode();
18230 return IsCompatibility;
18233template <
typename ItT>
18236 bool Changed =
false;
18239 if (
R.isDeleted(
I))
18242 if (
auto *RootOp = dyn_cast<Instruction>(
Op))
18243 Changed |= vectorizeRootInstruction(
nullptr, RootOp, BB, R,
TTI);
18247 if (
R.isDeleted(
I))
18249 Changed |= tryToVectorize(
I, R);
18256 return compareCmp<false>(V, V2, *TLI, *DT);
18259 auto AreCompatibleCompares = [&](
Value *V1,
Value *
V2) {
18262 return compareCmp<true>(V1, V2, *TLI, *DT);
18269 if (Vals.
size() <= 1)
18271 Changed |= tryToVectorizeSequence<Value>(
18272 Vals, CompareSorter, AreCompatibleCompares,
18275 bool ArePossiblyReducedInOtherBlock =
any_of(Candidates, [](
Value *V) {
18277 auto *Select = dyn_cast<SelectInst>(U);
18279 Select->getParent() != cast<Instruction>(V)->getParent();
18282 if (ArePossiblyReducedInOtherBlock)
18284 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18290bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18292 assert(
all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18293 "This function only accepts Insert instructions");
18294 bool OpsChanged =
false;
18297 for (
auto *
I :
reverse(Instructions)) {
18298 if (
R.isDeleted(
I))
18300 OpsChanged |= vectorizeHorReduction(
nullptr,
I, BB, R,
TTI, PostponedInsts);
18303 for (
auto *
I :
reverse(Instructions)) {
18304 if (
R.isDeleted(
I) || isa<CmpInst>(
I))
18306 if (
auto *LastInsertValue = dyn_cast<InsertValueInst>(
I)) {
18307 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
18308 }
else if (
auto *LastInsertElem = dyn_cast<InsertElementInst>(
I)) {
18309 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
18313 OpsChanged |= tryToVectorize(PostponedInsts, R);
18320 bool Changed =
false;
18327 auto PHICompare = [
this, &PHIToOpcodes](
Value *V1,
Value *
V2) {
18330 "Expected vectorizable types only.");
18339 if (Opcodes1.
size() < Opcodes2.
size())
18341 if (Opcodes1.
size() > Opcodes2.
size())
18343 for (
int I = 0, E = Opcodes1.
size();
I < E; ++
I) {
18346 auto *
I1 = dyn_cast<Instruction>(Opcodes1[
I]);
18347 auto *I2 = dyn_cast<Instruction>(Opcodes2[
I]);
18352 return NodeI2 !=
nullptr;
18355 assert((NodeI1 == NodeI2) ==
18357 "Different nodes should have different DFS numbers");
18358 if (NodeI1 != NodeI2)
18361 if (S.getOpcode() && !S.isAltShuffle())
18363 return I1->getOpcode() < I2->getOpcode();
18372 bool C1 = isa<Constant>(Opcodes1[
I]) && !isa<UndefValue>(Opcodes1[
I]);
18373 bool C2 = isa<Constant>(Opcodes2[
I]) && !isa<UndefValue>(Opcodes2[
I]);
18381 bool U1 = isa<UndefValue>(Opcodes1[
I]);
18382 bool U2 = isa<UndefValue>(Opcodes2[
I]);
18386 auto ValID1 = Opcodes1[
I]->getValueID();
18387 auto ValID2 = Opcodes2[
I]->getValueID();
18388 if (ValID1 == ValID2)
18390 if (ValID1 < ValID2)
18392 if (ValID1 > ValID2)
18401 assert(U1 && U2 &&
"The only thing left should be undef & undef.");
18406 auto AreCompatiblePHIs = [&PHIToOpcodes,
this](
Value *V1,
Value *
V2) {
18409 if (V1->getType() !=
V2->getType())
18413 if (Opcodes1.
size() != Opcodes2.
size())
18415 for (
int I = 0, E = Opcodes1.
size();
I < E; ++
I) {
18417 if (isa<UndefValue>(Opcodes1[
I]) || isa<UndefValue>(Opcodes2[
I]))
18419 if (
auto *I1 = dyn_cast<Instruction>(Opcodes1[
I]))
18420 if (
auto *I2 = dyn_cast<Instruction>(Opcodes2[
I])) {
18421 if (
I1->getParent() != I2->getParent())
18428 if (isa<Constant>(Opcodes1[
I]) && isa<Constant>(Opcodes2[
I]))
18430 if (Opcodes1[
I]->getValueID() != Opcodes2[
I]->getValueID())
18436 bool HaveVectorizedPhiNodes =
false;
18441 auto *
P = dyn_cast<PHINode>(&
I);
18447 if (!VisitedInstrs.
count(
P) && !
R.isDeleted(
P) &&
18460 if (!Opcodes.
empty())
18464 while (!Nodes.empty()) {
18465 auto *
PHI = cast<PHINode>(Nodes.pop_back_val());
18468 for (
Value *V :
PHI->incoming_values()) {
18469 if (
auto *PHI1 = dyn_cast<PHINode>((V))) {
18470 Nodes.push_back(PHI1);
18478 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18479 Incoming, PHICompare, AreCompatiblePHIs,
18481 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18484 Changed |= HaveVectorizedPhiNodes;
18486 }
while (HaveVectorizedPhiNodes);
18488 VisitedInstrs.
clear();
18490 InstSetVector PostProcessInserts;
18494 auto VectorizeInsertsAndCmps = [&](
bool VectorizeCmps) {
18495 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18496 if (VectorizeCmps) {
18497 Changed |= vectorizeCmpInsts(
reverse(PostProcessCmps), BB, R);
18498 PostProcessCmps.
clear();
18500 PostProcessInserts.clear();
18505 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
18506 return PostProcessCmps.
contains(Cmp);
18507 return isa<InsertElementInst, InsertValueInst>(
I) &&
18508 PostProcessInserts.contains(
I);
18514 return I->use_empty() &&
18515 (
I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(
I));
18520 if (isa<ScalableVectorType>(It->getType()))
18524 if (
R.isDeleted(&*It))
18527 if (!VisitedInstrs.
insert(&*It).second) {
18528 if (HasNoUsers(&*It) &&
18529 VectorizeInsertsAndCmps(It->isTerminator())) {
18539 if (isa<DbgInfoIntrinsic>(It))
18543 if (
PHINode *
P = dyn_cast<PHINode>(It)) {
18545 if (
P->getNumIncomingValues() == 2) {
18548 if (Root && vectorizeRootInstruction(
P, Root, BB, R,
TTI)) {
18557 for (
unsigned I = 0, E =
P->getNumIncomingValues();
I != E;
I++) {
18562 if (BB ==
P->getIncomingBlock(
I) ||
18568 if (
auto *PI = dyn_cast<Instruction>(
P->getIncomingValue(
I));
18569 PI && !IsInPostProcessInstrs(PI))
18570 Changed |= vectorizeRootInstruction(
nullptr, PI,
18571 P->getIncomingBlock(
I), R,
TTI);
18576 if (HasNoUsers(&*It)) {
18577 bool OpsChanged =
false;
18578 auto *
SI = dyn_cast<StoreInst>(It);
18588 TryToVectorizeRoot |= (
I == Stores.
end() ||
I->second.size() == 1) &&
18589 SI->getValueOperand()->hasOneUse();
18591 if (TryToVectorizeRoot) {
18592 for (
auto *V : It->operand_values()) {
18595 if (
auto *VI = dyn_cast<Instruction>(V);
18596 VI && !IsInPostProcessInstrs(VI))
18598 OpsChanged |= vectorizeRootInstruction(
nullptr, VI, BB, R,
TTI);
18605 VectorizeInsertsAndCmps(It->isTerminator());
18616 if (isa<InsertElementInst, InsertValueInst>(It))
18617 PostProcessInserts.insert(&*It);
18618 else if (isa<CmpInst>(It))
18619 PostProcessCmps.
insert(cast<CmpInst>(&*It));
18626 auto Changed =
false;
18627 for (
auto &Entry : GEPs) {
18630 if (Entry.second.size() < 2)
18633 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length "
18634 << Entry.second.size() <<
".\n");
18641 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
18642 unsigned EltSize =
R.getVectorElementSize(*Entry.second[0]->idx_begin());
18643 if (MaxVecRegSize < EltSize)
18646 unsigned MaxElts = MaxVecRegSize / EltSize;
18647 for (
unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
18648 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18661 Candidates.remove_if([&R](
Value *
I) {
18662 return R.isDeleted(cast<Instruction>(
I)) ||
18663 isa<Constant>(cast<GetElementPtrInst>(
I)->idx_begin()->
get());
18671 for (
int I = 0, E = GEPList.size();
I < E && Candidates.
size() > 1; ++
I) {
18672 auto *GEPI = GEPList[
I];
18673 if (!Candidates.count(GEPI))
18675 auto *SCEVI = SE->
getSCEV(GEPList[
I]);
18676 for (
int J =
I + 1; J < E && Candidates.
size() > 1; ++J) {
18677 auto *GEPJ = GEPList[J];
18678 auto *SCEVJ = SE->
getSCEV(GEPList[J]);
18679 if (isa<SCEVConstant>(SE->
getMinusSCEV(SCEVI, SCEVJ))) {
18680 Candidates.remove(GEPI);
18681 Candidates.remove(GEPJ);
18682 }
else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18683 Candidates.remove(GEPJ);
18690 if (Candidates.
size() < 2)
18697 auto BundleIndex = 0
u;
18698 for (
auto *V : Candidates) {
18699 auto *
GEP = cast<GetElementPtrInst>(V);
18700 auto *GEPIdx =
GEP->idx_begin()->get();
18701 assert(
GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18702 Bundle[BundleIndex++] = GEPIdx;
18714 Changed |= tryToVectorizeList(Bundle, R);
18720bool SLPVectorizerPass::vectorizeStoreChains(
BoUpSLP &R) {
18721 bool Changed =
false;
18726 if (
V->getValueOperand()->getType()->getTypeID() <
18727 V2->getValueOperand()->getType()->getTypeID())
18729 if (
V->getValueOperand()->getType()->getTypeID() >
18730 V2->getValueOperand()->getType()->getTypeID())
18732 if (
V->getPointerOperandType()->getTypeID() <
18733 V2->getPointerOperandType()->getTypeID())
18735 if (
V->getPointerOperandType()->getTypeID() >
18736 V2->getPointerOperandType()->getTypeID())
18739 if (isa<UndefValue>(
V->getValueOperand()) ||
18740 isa<UndefValue>(
V2->getValueOperand()))
18742 if (
auto *I1 = dyn_cast<Instruction>(
V->getValueOperand()))
18743 if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
18747 DT->
getNode(I2->getParent());
18748 assert(NodeI1 &&
"Should only process reachable instructions");
18749 assert(NodeI2 &&
"Should only process reachable instructions");
18750 assert((NodeI1 == NodeI2) ==
18752 "Different nodes should have different DFS numbers");
18753 if (NodeI1 != NodeI2)
18758 return I1->getOpcode() < I2->getOpcode();
18760 if (isa<Constant>(
V->getValueOperand()) &&
18761 isa<Constant>(
V2->getValueOperand()))
18763 return V->getValueOperand()->getValueID() <
18764 V2->getValueOperand()->getValueID();
18776 isa<UndefValue>(
V2->getValueOperand()))
18779 if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
18780 if (
I1->getParent() != I2->getParent())
18783 return S.getOpcode() > 0;
18786 isa<Constant>(
V2->getValueOperand()))
18789 V2->getValueOperand()->getValueID();
18794 for (
auto &Pair : Stores) {
18795 if (Pair.second.size() < 2)
18799 << Pair.second.size() <<
".\n");
18808 Pair.second.rend());
18809 Changed |= tryToVectorizeSequence<StoreInst>(
18810 ReversedStores, StoreSorter, AreCompatibleStores,
18812 return vectorizeStores(Candidates, R, Attempted);
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
const BasicBlock * getParent() const
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cummulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the opearnd at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not schedule.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are are tuples (A,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
testing::Matcher< const detail::ErrorHolder & > Failed()
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const