#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Enable vectorization for wider vector utilization"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc(
        "When true, SLP vectorizer bypasses profitability checks based on "
        "heuristics and makes vectorization decision via cost modeling."));

    cl::desc("Attempt to vectorize horizontal reductions"));

        "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
  if (SLPReVec && isa<FixedVectorType>(Ty))
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&

  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();

  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);

  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&

              ICmpInst::Predicate Pred;
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))

              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||

           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
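// Illustrative note (added comment, not in the original source): "sub x, y"
// is treated as commutative above when every user only cares about the
// magnitude of the result, e.g. when the sole user is @llvm.abs or an
// eq/ne comparison with zero, since abs(x - y) == abs(y - x) and
// (x - y == 0) == (y - x == 0):
//
//   %d = sub nsw i32 %x, %y
//   %a = call i32 @llvm.abs.i32(i32 %d, i1 true)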
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
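// Illustrative example (added comment): together with the per-level index
// accumulation, the loop above flattens a multi-index into one linear lane.
// For
//   %r = insertvalue {[2 x i32], i32} %agg, i32 %v, 0, 1
// the indices (0, 1) scale by the aggregate width at each level:
// struct level gives 0 * 2 + 0 = 0, array level gives 0 * 2 + 1 = 1, so the
// inserted scalar corresponds to flat lane 1.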
  if (MaskArg == UseMask::UndefsAsMask)
  if (MaskArg == UseMask::FirstArg && Value < VF)
    UseMask.reset(Value);
  else if (MaskArg == UseMask::SecondArg && Value >= VF)
    UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);

  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
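// Illustrative note (added comment): the insertelement walk above lets a
// partially-initialized vector still count as undef in the lanes the mask
// does not use. For
//   %v = insertelement <4 x i32> poison, i32 %x, i32 2
// only lane 2 is defined, so %v is treated as an all-undef (or all-poison)
// source whenever UseMask never tests lane 2.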
static std::optional<TargetTransformInfo::ShuffleKind>
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    auto *EE = dyn_cast<ExtractElementInst>(V);
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))

  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
    if (isa<UndefValue>(Vec)) {
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
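// Illustrative note (added comment): with two source vectors of width
// Size == 4, extracts forming mask [0, 5, 2, 7] keep every element in its
// own lane (Mask[I] % Size == I), so the shuffle is classified as a select
// (SK_Select); mask [1, 4, 6, 3] moves elements across lanes and is
// classified as a permutation (SK_PermuteTwoSrc).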
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
struct InstructionsState {
  Value *OpValue = nullptr;

  unsigned getAltOpcode() const {

  bool isAltShuffle() const { return AltOp != MainOp; }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  InstructionsState() = delete;
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}

  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
                                       unsigned BaseIndex = 0);

         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&

                                       unsigned BaseIndex) {
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  bool SwappedPredsCompatible = [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
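// Illustrative note (added comment): for a bundle such as
//   %a0 = add i32 %x0, %y0
//   %s1 = sub i32 %x1, %y1
//   %a2 = add i32 %x2, %y2
//   %s3 = sub i32 %x3, %y3
// getSameOpcode returns MainOp == add and AltOp == sub; the bundle is then
// vectorized as two vector ops blended by a shufflevector (an alt-shuffle).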
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;

  if (LoadInst *LI = dyn_cast<LoadInst>(I))

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
          (SubMask.size() == Mask.size() &&
           std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                       [](int Idx) { return Idx == PoisonMaskElem; }))) &&
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");

  for (unsigned Lane : seq<unsigned>(VL.size()))
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane);

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
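// Illustrative example (added comment): inverting an ordering this way maps
// Indices = {1, 2, 0} to Mask = {2, 0, 1}: element 0 of the reordered
// vector comes from position 1, so Mask[1] = 0, and so on.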
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];

  auto *I = dyn_cast<Instruction>(V);
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
        auto *IU = dyn_cast<Instruction>(U);
        return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
namespace slpvectorizer {

  struct ScheduleData;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),

    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    return MinBWs.at(VectorizableTree.front().get()).second;

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    ExternalUsesAsGEPs.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();

    return MaxVecRegSize;
    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;

                       bool TryRecursiveCheck = true) const;
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";

        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
      if (isa<LoadInst>(V1)) {
        auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
          auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
          return AllUsersVectorized(V1) && AllUsersVectorized(V2);
        if (R.TTI->isLegalBroadcastLoad(V1->getType(),
            ((int)V1->getNumUses() == NumLanes ||
             AllUsersAreInternal(V1, V2)))

      auto CheckSameEntryOrFail = [&]() {
        if (const TreeEntry *TE1 = R.getTreeEntry(V1);
            TE1 && TE1 == R.getTreeEntry(V2))

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          return CheckSameEntryOrFail();

            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
              R.TTI->isLegalMaskedGather(
          return CheckSameEntryOrFail();
        if (std::abs(*Dist) > NumLanes / 2)

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);

        if (isa<UndefValue>(V2))
        Value *EV2 = nullptr;
          int Dist = Idx2 - Idx1;
          if (std::abs(Dist) == 0)
          if (std::abs(Dist) > NumLanes / 2)
        return CheckSameEntryOrFail();

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        if (S.getOpcode() &&
            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
              return cast<Instruction>(V)->getNumOperands() ==
                     S.MainOp->getNumOperands();

      if (isa<UndefValue>(V2))
      return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, std::nullopt);
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
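// Illustrative note (added comment): the recursion rewards operand pairs
// whose defining instructions also pair up well. When scoring (%a, %b) for
//   %a = add i32 %x, %p   and   %b = add i32 %y, %q
// where %x and %y are consecutive loads, the recursive call scores (%x, %y)
// highly, and that score propagates up to make (%a, %b) the preferred pair.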
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;

  enum class ReorderingMode {

  const Loop *L = nullptr;

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];

  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
         OpIdx != NumOperands; ++OpIdx)
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
    for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))
      Uniques.insert(OpIdxLnV);
    int UniquesCount = Uniques.size();
    int UniquesCntWithIdxLaneV =
        Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    int UniquesCntWithOpIdxLaneV =
        Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
               UniquesCntWithOpIdxLaneV) -
           (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;

    bool OpIdxAPO = getData(OpIdx, Lane).APO;
      std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Constant:
        if (isa<Constant>(Op) ||
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          if (isa<Constant>(Op)) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
          if (isa<UndefValue>(Op) || !isa<Constant>(Op))
      case ReorderingMode::Splat:
        if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto *It = HashMap.find(NumFreeOpsHash.Hash);
        if (It == HashMap.end())
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
          ++NumOpsWithSameOpcodeParent;
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(isa<Instruction>(VL[0]) && "Expected instruction");
    unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
    constexpr unsigned IntrinsicNumOperands = 2;
    if (isa<IntrinsicInst>(VL[0]))
      NumOperands = IntrinsicNumOperands;
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),

  unsigned getNumOperands() const { return OpsVec.size(); }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILane = getValue(OpI, Lane);
        bool IsConstantOp = isa<Constant>(OpILane);
            ((Lns > 2 && isa<Constant>(Data.V)) ||
               isa<Constant>(Data.V)))) ||
            (IsInvariant && !isa<Constant>(Data.V) &&
             L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
      if (!FoundCandidate)
    return getNumLanes() == 2 || Cnt > 1;
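// Illustrative note (added comment): shouldBroadcast asks "does this value
// repeat in the same operand slot across lanes?". For the two lanes
//   %a0 = add i32 %x, %s
//   %a1 = add i32 %y, %s
// the second operand %s appears in more than one lane, so it is cheaper to
// materialize it once and splat it than to gather distinct scalars.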
  bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
            const OperandData &Data = getData(OpI, Ln);
            if (Data.APO != OpAPO || Data.IsUsed)
            Value *OpILn = getValue(OpI, Ln);
            return (L && L->isLoopInvariant(OpILn)) ||
                    Op->getParent() == cast<Instruction>(OpILn)->getParent());
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
    appendOperandsOfVL(RootVL);

    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);
      if (isa<LoadInst>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
            !canBeVectorized(OpILane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      else if (isa<Argument>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Failed;

    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
    if (SkipReordering())

    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= (int)NumLanes)
      assert(LastLane >= 0 && LastLane < (int)NumLanes &&
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        std::optional<unsigned> BestIdx = getBestOperand(
            OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
          swap(OpIdx, *BestIdx, Lane);
          StrategyFailed = true;
        if (MainAltOps[OpIdx].size() != 2) {
          OperandData &AltOp = getData(OpIdx, Lane);
          InstructionsState OpS =
          if (OpS.getOpcode() && OpS.isAltShuffle())
    if (!StrategyFailed)
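// Illustrative example (added comment): given the two lanes
//   %r0 = add i32 %a, %b
//   %r1 = add i32 %c, %a
// the reordering swaps %r1's operands so the repeated value %a stays in the
// same operand slot across lanes, turning one operand vector into a cheap
// splat of %a instead of two gathers.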
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {
    DeletedInstructions.insert(I);

  template <typename T>
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      auto *I = cast<Instruction>(V);
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
              cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
      VI->removeFromParent();
      DeletedInstructions.insert(VI);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return NonScheduledFirst.contains(V);

  bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
                             unsigned &MaxDepthLevel,
                             bool &IsProfitableToDemote,
                             bool IsTruncRoot) const;
  canReorderOperands(TreeEntry *UserTE,

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);

  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  getCastContextHint(const TreeEntry &TE) const;
                    const EdgeInfo &EI);
                  bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
                             unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
      Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
                      [Scalars](Value *V, int Idx) {
                        return (isa<UndefValue>(V) &&
                                Idx == PoisonMaskElem) ||
                               (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

    bool isGather() const { return State == NeedToGather; }

    VecTreeTy &Container;
      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");

    void setOperandsInOrder() {
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");

    bool isAltShuffle() const { return MainOp != AltOp; }

      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);

      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))

    void setOperations(const InstructionsState &S) {

    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;

    unsigned getAltOpcode() const {

    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));

    bool isNonPowOf2Vec() const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
        dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        dbgs() << "NeedToGather\n";
      dbgs() << "MainOp: ";
        dbgs() << *MainOp << "\n";
      dbgs() << "AltOp: ";
        dbgs() << *AltOp << "\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";

  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (!Last->isGather()) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
               "Scalar already in tree!");
          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
        ScalarToTreeEntry[V] = Last;
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
             "Bundle and VL out of sync");
      for (Value *V : VL) {
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      assert(!BundleMember && "Bundle and VL out of sync");
      bool AllConstsOrCasts = true;
        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    if (UserTreeIdx.UserTE) {
      Last->UserTreeIndices.push_back(UserTreeIdx);
      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
             "Reordering isn't implemented for non-power-of-2 nodes yet");
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);

  bool areAltOperandsProfitable(const InstructionsState &S,

  TreeEntry::EntryState getScalarsVectorizationState(

  using ValueToGatherNodesMap =
  ValueToGatherNodesMap ValueToGatherNodes;
  struct ExternalUser {

    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
  struct ScheduleData {
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");

        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();

    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;
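    // Illustrative note (added comment): UnscheduledDeps counts down as
    // operands get scheduled. A bundle {%a, %b} where %a still waits on two
    // defs and %b on one has unscheduledDepsInBundle() == 3; each
    // incrementUnscheduledDeps(-1) on a member moves the whole bundle
    // closer to isReady().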
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;

    Value *OpValue = nullptr;

    TreeEntry *TE = nullptr;

    ScheduleData *FirstInBundle = nullptr;

    ScheduleData *NextInBundle = nullptr;

    ScheduleData *NextLoadStore = nullptr;

    int SchedulingRegionID = 0;

    int SchedulingPriority = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

    bool IsScheduled = false;

                                const BoUpSLP::ScheduleData &SD) {
  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;
      ++SchedulingRegionID;
      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);

    ScheduleData *getScheduleData(Value *V, Value *Key) {
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)

        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
                  << "SLP: gets ready (def): " << *DepBundle << "\n");

        if (TreeEntry *TE = BundleMember->TE) {
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");
          auto *In = BundleMember->Inst;
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))

        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                << "SLP: gets ready (mem): " << *DepBundle << "\n");

        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                << "SLP: gets ready (ctl): " << *DepBundle << "\n");
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
    void doForAllOpcodes(Value *V,
      if (ScheduleData *SD = getScheduleData(V))
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))

    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            ReadyList.insert(SD);
                << "SLP: initially in ready list: " << *SD << "\n");
    std::optional<ScheduleData *>
                      const InstructionsState &S);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,

    void resetSchedule();

        ExtraScheduleDataMap;

    ScheduleData *FirstLoadStoreInRegion = nullptr;

    ScheduleData *LastLoadStoreInRegion = nullptr;

    bool RegionHasStackSave = false;

    int ScheduleRegionSize = 0;

    int SchedulingRegionID = 1;

  void scheduleBlock(BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  unsigned ReductionBitWidth = 0;

  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
  struct ChildIteratorType
          ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

    return R.VectorizableTree[0].get();

    return {N->UserTreeIndices.begin(), N->Container};

    return {N->UserTreeIndices.end(), N->Container};

  class nodes_iterator {
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

    return nodes_iterator(R->VectorizableTree.begin());

    return nodes_iterator(R->VectorizableTree.end());

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }

    OS << Entry->Idx << ".\n";

    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;

    if (Entry->isGather())
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      if (isa<PHINode>(I))
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
        I->insertBefore(F->getEntryBlock().getTerminator());
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
    I->dropAllReferences();
  for (auto *I : DeletedInstructions) {
           "trying to erase instruction with users.");
    I->eraseFromParent();
#ifdef EXPENSIVE_CHECKS

  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    Reuses[Mask[I]] = Prev[I];

                               bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (Order.empty()) {
    std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    PrevOrder.swap(Order);
  for (unsigned I = 0; I < Sz; ++I)
      Order[I] = PrevOrder[Mask[I]];
    return Data.value() == Sz || Data.index() == Data.value();
  if (Order.empty()) {
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  for (unsigned I = 0; I < Sz; ++I)
    Order[MaskOrder[I]] = I;
std::optional<BoUpSLP::OrdersType>
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
    return std::nullopt;
  if (NumParts == 0 || NumParts >= NumScalars)
  tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      Entries.front().front()->isSame(TE.Scalars)) {
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
    return all_of(Mask, [&](int I) {
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  for (int I : seq<int>(0, NumParts)) {
    if (ShuffledSubMasks.test(I))
    const int VF = GetVF(I);
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
    int FirstMin = INT_MAX;
    int SecondVecFound = false;
    for (int K : seq<int>(Limit)) {
      int Idx = Mask[I * PartSz + K];
        Value *V = GatheredScalars[I * PartSz + K];
          SecondVecFound = true;
          SecondVecFound = true;
    FirstMin = (FirstMin / PartSz) * PartSz;
    if (SecondVecFound) {
      std::fill(Slice.begin(), Slice.end(), NumScalars);
      ShuffledSubMasks.set(I);
    for (int K : seq<int>(Limit)) {
      int Idx = Mask[I * PartSz + K];
      if (Idx >= PartSz) {
        SecondVecFound = true;
      if (CurrentOrder[I * PartSz + Idx] >
              static_cast<unsigned>(I * PartSz + K) &&
          CurrentOrder[I * PartSz + Idx] !=
              static_cast<unsigned>(I * PartSz + Idx))
        CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
    if (SecondVecFound) {
      std::fill(Slice.begin(), Slice.end(), NumScalars);
      ShuffledSubMasks.set(I);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  .getKnownMinValue());
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
                                 bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
         getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)

template <typename T>
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;

  unsigned Sz = Order.size();
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find the SCEVs for all pointers and the lowest/highest among them.
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check that each distance is a unique
  // multiple of the stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      // ...
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if the order is non-consecutive.
    SortedIndices.resize(PointerOps.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  // ...
}
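/// Classifies a bundle of loads: consecutive (plain wide load), strided,
/// masked gather, or gather-only, collecting the pointer operands and the
/// sorted order along the way. The GEP cost helper is forward-declared first,
/// since the profitability checks below need it.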
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load: only "packed" scalar types are supported.
  Type *ScalarTy = VL0->getType();
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  const unsigned Sz = VL.size();
  PointerOps.clear();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
    if (!L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of the pointer operands, or that all pointers are the
  // same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
  if (!Order.empty() && !has_single_bit(VL.size())) {
    assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                   "supported with VectorizeNonPowerOf2");
    return LoadsState::Gather;
  }

  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  // ...
  if (IsSorted) {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<unsigned>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    // Simple check if not a strided access - clear order.
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
    // Try to generate a strided load node if the potential stride is
    // profitable or the loads are in reversed order.
    if (IsPossibleStrided &&
        (((Sz > MinProfitableStridedLoads ||
           (static_cast<unsigned>(std::abs(*Diff)) <=
                MaxProfitableLoadStride * Sz &&
            isPowerOf2_32(std::abs(*Diff)))) &&
          static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
         *Diff == -(static_cast<int>(Sz) - 1))) {
      int Stride = *Diff / static_cast<int>(Sz - 1);
      if (*Diff == Stride * static_cast<int>(Sz - 1)) {
        // ...
        // Iterate through all pointers and check that each distance is a
        // unique multiple of Stride.
        SmallSet<int, 8> Dists;
        for (Value *Ptr : PointerOps) {
          int Dist = 0;
          if (Ptr == PtrN)
            Dist = *Diff;
          else if (Ptr != Ptr0)
            Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
          // If the strides are not the same or repeated, we can't vectorize.
          if (((Dist / Stride) * Stride) != Dist ||
              !Dists.insert(Dist).second)
            break;
        }
        if (Dists.size() == Sz)
          return LoadsState::StridedVectorize;
      }
    }
  }
  // If a masked gather would be more expensive than several smaller-VF
  // vectorized loads plus shuffles, prefer the latter.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    // ...
    unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
    MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
    for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
      unsigned VectorizedCnt = 0;
      // ...
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
           Cnt += VF, ++VectorizedCnt) {
        // ... classify each sub-bundle of VF loads ...
      }
      // The whole bundle can be vectorized as a series of smaller loads.
      if (VectorizedCnt == VL.size() / VF) {
        // Compare masked gather cost and loads + insertsubvector costs.
        auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
            TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
            CostKind, ScalarTy, VecTy);
        InstructionCost MaskedGatherCost =
            TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                       /*Ptr=*/nullptr,
                                       /*VariableMask=*/false, CommonAlignment,
                                       CostKind) +
            VectorGEPCost - ScalarGEPCost;
        InstructionCost VecLdCost = 0;
        for (unsigned I = 0, End = VL.size() / VF; I < End; ++I) {
          auto *LI0 = cast<LoadInst>(VL[I * VF]);
          // Consecutive sub-bundle: plain vector load.
          {
            auto [ScalarGEPCost, VectorGEPCost] =
                getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                            LI0->getPointerOperand(), Instruction::Load,
                            CostKind, ScalarTy, SubVecTy);
            VecLdCost += TTI.getMemoryOpCost(
                             Instruction::Load, SubVecTy, LI0->getAlign(),
                             LI0->getPointerAddressSpace(), CostKind,
                             TTI::OperandValueInfo()) +
                         VectorGEPCost - ScalarGEPCost;
          }
          // Strided sub-bundle.
          {
            auto [ScalarGEPCost, VectorGEPCost] =
                getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                            LI0->getPointerOperand(), Instruction::Load,
                            CostKind, ScalarTy, SubVecTy);
            VecLdCost += TTI.getStridedMemoryOpCost(
                             Instruction::Load, SubVecTy,
                             LI0->getPointerOperand(), /*VariableMask=*/false,
                             CommonAlignment, CostKind) +
                         VectorGEPCost - ScalarGEPCost;
          }
          // Scattered sub-bundle: masked gather.
          {
            auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
                TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                LI0->getPointerOperand(), Instruction::GetElementPtr,
                CostKind, ScalarTy, SubVecTy);
            VecLdCost += TTI.getGatherScatterOpCost(
                             Instruction::Load, SubVecTy,
                             LI0->getPointerOperand(), /*VariableMask=*/false,
                             CommonAlignment, CostKind) +
                         VectorGEPCost - ScalarGEPCost;
          }
          // (gather-only sub-bundles are impossible here)
          // "Expected only consecutive, strided or masked gather loads."
          // Add the shuffle cost of merging this sub-vector into the result.
          SmallVector<int> ShuffleMask(VL.size());
          for (int Idx : seq<int>(0, VL.size()))
            ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
          // ...
        }
        // If the masked gather cost is higher - better to vectorize, so
        // consider this bundle as a series of smaller loads.
        if (MaskedGatherCost >= VecLdCost)
          return true;
      }
    }
    return false;
  };
  // ...
  auto *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
    if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
        !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
      // Check if a potential masked gather can be represented as a series
      // of loads + insertsubvectors instead.
      if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment))
        return LoadsState::Gather;
      return LoadsState::ScatterVectorize;
    }
  }
  return LoadsState::Gather;
}
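/// Groups pointers by their underlying base object, sorts each group by
/// offset, and reports an order that places pointers with consecutive
/// offsets next to one another.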
4793 "Expected list of pointer operands.");
4798 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4803 std::optional<int> Diff =
4809 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4815 if (Bases.
size() > VL.
size() / 2 - 1)
4819 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4825 bool AnyConsecutive =
false;
4826 for (
auto &
Base : Bases) {
4827 auto &Vec =
Base.second;
4828 if (Vec.size() > 1) {
4830 const std::tuple<Value *, int, unsigned> &
Y) {
4831 return std::get<1>(
X) < std::get<1>(
Y);
4833 int InitialOffset = std::get<1>(Vec[0]);
4835 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4841 SortedIndices.
clear();
4842 if (!AnyConsecutive)
4845 for (
auto &
Base : Bases) {
4846 for (
auto &
T :
Base.second)
4851 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
  }

  BoUpSLP::OrdersType Order;
  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
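/// Checks whether the two given insertelement instructions belong to the same
/// buildvector sequence, walking the chain of vector operands via
/// \p GetBaseOperand and rejecting sequences that reuse an insertion index.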
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
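/// Computes the preferred order for a tree entry: for vectorized nodes this
/// is typically ReorderIndices; for gather nodes the function checks whether
/// the scalars can be reordered into a cheaper form (single-vector shuffle,
/// clustered reuses, partially ordered loads, etc.).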
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
  if (TE.isNonPowOf2Vec())
    return std::nullopt;
  // No need to reorder if we need to shuffle reuses; the node still needs a
  // shuffle anyway.
  if (!TE.ReuseShuffleIndices.empty()) {
    // ...
    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if the reuse shuffle indices can be improved by reordering.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
                                             2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K)
          ResOrder[Val + Sz * K] = I + K;
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
        all_of(TE.Scalars, [Sz](Value *V) {
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of VF size; reuses shuffles are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
      !TE.isAltShuffle())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      // ...
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          // ... order by the insertelement indices ...
        }
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
            return false;
          // ... order by the extractelement indices ...
        }
      return false;
    };
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
        if (Idx != Order[Idx])
          return false;
      return true;
    };
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    DenseMap<unsigned, unsigned> PhiToId;
    SmallVector<unsigned> Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    OrdersType ResOrder(TE.Scalars.size());
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
      PhiToId[Id] = Id;
    stable_sort(Phis, PHICompare);
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that the gather of extractelements can be represented as just a
      // shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                   /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    //   insertelement poison, v, 0 [+ permute]
    // is cheaper than
    //   insertelement poison, v, n
    // - try to reorder.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      // ...
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        // ... compare the cost of inserting at lane 0 plus a permute against
        // inserting directly at lane Idx ...
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 4)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
  }
  return std::nullopt;
}
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
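/// Reorders a node with reused scalars: applies \p Mask to the reuses mask
/// and, for gathered nodes with clustered reuses, reorders the scalars
/// themselves and resets the reuses mask to identity submasks.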
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses - no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
5217 "Expected same size of orders");
5218 unsigned Sz = Order.
size();
5220 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5221 if (Order[
Idx] != Sz)
5222 UsedIndices.
set(Order[
Idx]);
5224 if (SecondaryOrder.
empty()) {
5225 for (
unsigned Idx : seq<unsigned>(0, Sz))
5226 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5229 for (
unsigned Idx : seq<unsigned>(0, Sz))
5230 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5231 !UsedIndices.
test(SecondaryOrder[
Idx]))
5232 Order[
Idx] = SecondaryOrder[
Idx];
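/// Reorders the tree from the root towards the leaves: for each vectorization
/// factor it counts how often each order is requested (by external store
/// users, alt-shuffle legality, phis, gathers) and applies the most used one.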
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Phi nodes can have a preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction on x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
  // Maps a TreeEntry to the reorder indices of its external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  TargetTransformInfo &TTIRef = *TTI;
  // Find all reorderable nodes with the given VF.
  for_each(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }
    // Patterns like [fadd,fsub] can be combined into a single instruction on
    // x86. Reordering them into [fsub,fadd] blocks this pattern. So check
    // whether we can keep the order.
    if (TE->isAltShuffle()) {
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // ...
      // If this pattern is supported by the target then we consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
      // ...
    }
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the
      // bottom-to-top stage.
      const TreeEntry *UserTE = TE.get();
      while (true) {
        if (UserTE->UserTreeIndices.size() != 1)
          break;
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
        // ...
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
       VF /= 2) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: the most used one. Scalars in
    // the nodes are then reordered according to it.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // ...
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; still need to extend and to use a
      // shuffle, just merge the reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count the number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the vector factor != number of scalars - use the natural order;
        // this is an attempt to reorder a node with reused scalars but with
        // external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // ...
      }
      // Stores actually store the mask, not the order; need to invert.
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
      const unsigned Sz = Order.size();
      for (unsigned Idx : seq<unsigned>(0, Sz))
        if (Idx != Order[Idx] && Order[Idx] != Sz)
          return false;
      return true;
    };
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if a filled identity is found (non-empty
      // order) with the same number of uses as the new candidate order, we can
      // choose this candidate order.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           IsIdentityOrder(BestOrder))) {
        // ...
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set the order of the user node.
    if (IsIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Need to reorder the reuses masks of the operands with a smaller
          // VF to be able to check whether the users of this node are
          // vectorized with this order.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          // Update the ordering of the operands with a smaller VF.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
              InsertElementInst>(TE->getMainOp()) &&
          !TE->isAltShuffle()) {
        // Build the correct orders for extract{element,value}, loads and
        // stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
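/// Checks whether the operands of \p UserTE can be reordered: collects the
/// vectorized operand edges and the reorderable gather operands, and refuses
/// when an operand node is shared with another user.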
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
  if (UserTE->isNonPowOf2Vec())
    return false;
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if the operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF: currently vectorized
  // loads/extracts without alternate operands plus some gathers of extracts.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // 1. Propagate order to the graph nodes that use only reordered nodes.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    // 2. If the entry has multiple uses - skip it and jump to the next node.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speed up
      // the search; the graph does not provide this dependency directly.
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // Erase filtered entries.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that the operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // A partially ordered order is skipped in favor of fully ordered
        // ones.
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order; need to invert.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
          if (TE->isNonPowOf2Vec())
            return false;
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            // ...
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its operand nodes
          // require reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if the users allow reordering of the simple operand.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
        const unsigned Sz = Order.size();
        for (unsigned Idx : seq<unsigned>(0, Sz))
          if (Idx != Order[Idx] && Order[Idx] != Sz)
            return false;
        return true;
      };
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer identity order. But, if a filled identity is found
        // (non-empty order) with the same number of uses as the new candidate
        // order, we can choose this candidate order.
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set the order of the user node.
      if (IsIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers, just reorder their scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder the reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder the operands of the user node and set the ordering for the
      // user node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert the user node into the list to try to sink the reordering
          // deeper in the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
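/// Records every vectorized scalar that is also used outside the tree (or
/// listed in \p ExternallyUsedValues) so that an extractelement can be
/// emitted for it later, remembering the lane it lives in.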
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto *ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        // ...
        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalars in the vectorized
          // instructions. If that is the case, the one in FoundLane will be
          // used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
DenseMap<Value *, SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // To save compilation time, don't visit values with too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip the entry if it is already in the tree.
      if (getTreeEntry(U))
        continue;

      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      auto &StoresVec = PtrToStoresMap[Ptr];
      // For now, just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      // Skip if this store is in a different basic block.
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
        continue;
      // Make sure that the stores are of the same type.
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
        continue;
      StoresVec.push_back(SI);
    }
  }
  return PtrToStoresMap;
}
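/// Checks whether the given user stores can form a single vector store:
/// computes their pointer offsets relative to the first store, requires the
/// sorted offsets to be consecutive, and returns the matching order.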
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them by their offsets relative to the first store and checking that the
  // offsets are consecutive.
  SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec[0] = {S0, 0};
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    // We failed to compare the pointers, so just abandon this StoresVec.
    if (!Diff)
      return false;
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  }

  // Sort the vector based on the pointers.
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  });

  // Check that the stores are consecutive: each offset differs by 1.
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
      return false;

  // Calculate the shuffle indices according to the offsets against the
  // sorted StoreOffsetVec.
  ReorderIndices.reserve(StoresVec.size());
  for (StoreInst *SI : StoresVec) {
    unsigned Idx = find_if(StoreOffsetVec,
                           [SI](const std::pair<StoreInst *, int> &Pair) {
                             return Pair.first == SI;
                           }) -
                   StoreOffsetVec.begin();
    ReorderIndices.push_back(Idx);
  }
  // An identity order (e.g., {0,1,2,3}) is treated as an empty order.
  auto IsIdentityOrder = [](const OrdersType &Order) {
    for (unsigned Idx : seq<unsigned>(0, Order.size()))
      if (Idx != Order[Idx])
        return false;
    return true;
  };
  if (IsIdentityOrder(ReorderIndices))
    ReorderIndices.clear();

  return true;
}
#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif

SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
      collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate, calculate the reorder index vector and
  // push it into ExternalReorderIndices.
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive, abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // The scalars in StoresVec can form a vector instruction, so record the
    // reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
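/// Generates a (key, subkey) hash pair for a value, used to cluster
/// compatible instructions (loads by pointer distance, compares by
/// predicate, casts and binary ops by opcode and types) when looking for
/// gather matches.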
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes, except for CmpInst, which
    // is also sorted by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      // ...
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      // ... hash by intrinsic ID, called function and operand bundles ...
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with a potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  // If this pattern is supported by the target, consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL)
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      // ...
      switch (Res.value_or(0)) {
      // ... swap operand candidates according to the best root pair ...
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that the operands do not contain the same values; count a perfect
  // diamond match or a shuffled match.
  if (Operands.size() == 2) {
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.MainOp->getParent());
  // Vectorize the node if:
  // 1. at least a single operand is constant or splat, or
  // 2. the operands have many loop invariants, or
  // 3. at least a single unique operand is supposed to be vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) || isSplat(Op))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found first duplicate - need to add a shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return getTreeEntry(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         // Do not vectorize the node if the estimated number of vector
         // instructions exceeds the estimated number of buildvector
         // instructions.
         (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
}
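/// Performs the per-opcode legality checks for a bundle and decides how it
/// should enter the tree: Vectorize, ScatterVectorize, StridedVectorize or
/// NeedToGather.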
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {
  assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto *VL0 = cast<Instruction>(S.OpValue);
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather; it will be vectorized later.
    if (VL.size() > 2 && S.MainOp->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL)
      for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
    if (!has_single_bit(VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found a 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load.
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::ScatterVectorize:
      return TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      return TreeEntry::StridedVectorize;
    case LoadsState::Gather: {
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL,
                      [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      return TreeEntry::NeedToGather;
    }
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      CmpInst *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars while being
    // packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }

    // Check the order of the pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (!isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and these need to be the same.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J]) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    // If this is not an alternate sequence of opcodes like add-sub,
    // then do not vectorize this instruction.
    if (!S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        // ...
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = cast<PHINode>(V);
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      // ...
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        // ...
        auto It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    DenseMap<Value *, unsigned> UniquePositions(VL.size());
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndices.emplace_back(
            isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndices.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    if (NumUniqueScalarValues == VL.size()) {
      ReuseShuffleIndices.clear();
    } else {
      // FIXME: Reshuffling scalars is not supported yet for non-power-of-2
      // ops.
      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      if (NumUniqueScalarValues <= 1 ||
          (UniquePositions.size() == 1 &&
           all_of(UniqueValues,
                  [](Value *V) {
                    return isa<UndefValue>(V) || !isConstant(V);
                  })) ||
          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
            all_of(UniqueValues, [=](Value *V) {
              return isa<ExtractElementInst>(V) ||
                     areAllUsersVectorized(cast<Instruction>(V),
                                           UserIgnoreList);
            })) {
          // Find the number of elements that forms a full vector.
          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(PWSz - UniqueValues.size(),
                                    UniqueValues.back());
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);

  // Don't handle ephemeral values.
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext
  // of a load), in which case peek through to include it in the tree, without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
        VL.size() >= 4 &&
        (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() ==
                      cast<Instruction>(S.MainOp)->getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }
  // Don't handle scalable vectors.
  if (S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle vectors.
  if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
      !isa<InsertElementInst>(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
      return;
    }

  // If all of the operands are identical or constant we have a simple
  // solution. If we deal with insert/extract instructions, they all must
  // have constant indices, otherwise we should gather them, not try to
  // vectorize. If an alternate op node with 2 elements has gathered operands
  // - do not vectorize.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
      return false;
    // ...
    // Check if all operands are extracts, part of vector nodes or can build
    // a valid diamond node.
    SmallVector<unsigned, 2> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(0, S.MainOp->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.MainOp->getNumOperands() / 2)
      return false;
    if (S.MainOp->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(Candidates,
                 [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                   return findBestRootPair(Cand,
                                           LookAheadHeuristics::ScoreSplat);
                 }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
                       SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.OpValue) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }
  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Check if this is a duplicate of another entry.
  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
    if (!E->isSame(VL)) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *TEIt = find_if(It->getSecond(),
                             [&](TreeEntry *ME) { return ME->isSame(VL); });
        if (TEIt != It->getSecond().end())
          E = *TEIt;
        else
          E = nullptr;
      } else {
        E = nullptr;
      }
      if (!E) {
        if (!doesNotNeedToBeScheduled(S.OpValue)) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
        // ...
      }
    }
    if (E) {
      // Record the reuse of the tree node.
      E->UserTreeIndices.push_back(UserTreeIdx);
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
                        << ".\n");
      return;
    }
  }

  // Check that none of the instructions in the bundle are already in the
  // tree.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList && UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }

  // Special processing for sorted pointers for a ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(S.OpValue->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it a GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }
  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  auto *VL0 = cast<Instruction>(S.OpValue);
  BB = VL0->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch, so there is no place
  // to insert a shuffle if we need to.
  if (isa<CatchSwitchInst>(BB->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;

  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp = S.isAltShuffle() ?
                (unsigned) Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);

      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

      // Keeps the reordered operands to avoid code duplication.
      PHIHandler Handler(*DT, PH, VL);
      Handler.buildOperands();
      for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
        TE->setOperand(I, Handler.getOperands(I));
      for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
        buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
      if (CurrentOrder.empty()) {
        LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      } else {
        LLVM_DEBUG({
          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                    "with order";
          for (unsigned Idx : CurrentOrder)
            dbgs() << " " << Idx;
          dbgs() << "\n";
        });
        fixupOrderingIndices(CurrentOrder);
      }
      newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices, CurrentOrder);
      // This is a special case, as it does not gather, but at the same time
      // we are not extending buildTree_rec() towards the operands.
      ValueList Op0;
      Op0.assign(VL.size(), VL0->getOperand(0));
      VectorizableTree.back()->setOperand(0, Op0);
      return;
    }
    case Instruction::InsertElement: {
      assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

      auto OrdCompare = [](const std::pair<int, int> &P1,
                           const std::pair<int, int> &P2) {
        return P1.first > P2.first;
      };
      PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                    decltype(OrdCompare)>
          Indices(OrdCompare);
      for (int I = 0, E = VL.size(); I < E; ++I) {
        unsigned Idx = *getElementIndex(VL[I]);
        Indices.emplace(Idx, I);
      }
      OrdersType CurrentOrder(VL.size(), VL.size());
      bool IsIdentity = true;
      for (int I = 0, E = VL.size(); I < E; ++I) {
        CurrentOrder[Indices.top().second] = I;
        IsIdentity &= Indices.top().second == I;
        Indices.pop();
      }
      if (IsIdentity)
        CurrentOrder.clear();
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   std::nullopt, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");

      TE->setOperandsInOrder();
      buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
      return;
    }
    case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load: only "packed" types are supported.
      TreeEntry *TE = nullptr;
      fixupOrderingIndices(CurrentOrder);
      switch (State) {
      case TreeEntry::Vectorize:
        TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                          ReuseShuffleIndices, CurrentOrder);
        if (CurrentOrder.empty())
          LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
        else
          LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
        TE->setOperandsInOrder();
        break;
      case TreeEntry::StridedVectorize:
        // Vectorizing non-consecutive loads with a strided load.
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
        TE->setOperandsInOrder();
        LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
        break;
      case TreeEntry::ScatterVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices);
        TE->setOperandsInOrder();
        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
        LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
        break;
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected loads state.");
      }
      return;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
          std::make_pair(std::numeric_limits<unsigned>::min(),
                         std::numeric_limits<unsigned>::max()));
      if (ShuffleOrOp == Instruction::ZExt ||
          ShuffleOrOp == Instruction::SExt) {
        CastMaxMinBWSizes = std::make_pair(
            std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                               PrevMaxBW),
            std::min<unsigned>(
                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                PrevMinBW));
      } else if (ShuffleOrOp == Instruction::Trunc) {
        CastMaxMinBWSizes = std::make_pair(
            std::max<unsigned>(
                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                PrevMaxBW),
            std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                               PrevMinBW));
        ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
      } else if (ShuffleOrOp == Instruction::SIToFP ||
                 ShuffleOrOp == Instruction::UIToFP) {
        unsigned NumSignBits =
            ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
        if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
          APInt Mask = DB->getDemandedBits(OpI);
          NumSignBits = std::max(NumSignBits, Mask.countl_zero());
        }
        if (NumSignBits * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
          ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
      }
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      TE->setOperandsInOrder();
      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      ValueList Left, Right;
      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
        assert(P0 == CmpInst::getSwappedPredicate(P0) &&
               "Commutative Predicate mismatch");
        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
      } else {
        // Collect operands - commute if it uses the swapped predicate.
        for (Value *V : VL) {
          auto *Cmp = cast<CmpInst>(V);
          Value *LHS = Cmp->getOperand(0);
          Value *RHS = Cmp->getOperand(1);
          if (Cmp->getPredicate() != P0)
            std::swap(LHS, RHS);
          Left.push_back(LHS);
          Right.push_back(RHS);
        }
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      if (ShuffleOrOp == Instruction::ICmp) {
        unsigned NumSignBits0 =
            ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
        if (NumSignBits0 * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
        unsigned NumSignBits1 =
            ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
        if (NumSignBits1 * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
      }
      return;
    }
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");

      // Sort operands of the instructions so that each side is more likely to
      // have the same opcode.
      if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        return;
      }

      TE->setOperandsInOrder();
      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      return;
    }
    case Instruction::GetElementPtr: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      SmallVector<ValueList, 2> Operands(2);
      // Prepare the operand vector for the pointer operands.
      for (Value *V : VL) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands.front().push_back(V);
          continue;
        }
        Operands.front().push_back(GEP->getPointerOperand());
      }
      TE->setOperand(0, Operands.front());
      // Need to cast all indices to the same type before vectorization to
      // avoid a crash. This is also required to find correct matches between
      // different gather nodes and reuse the vectorized values rather than
      // trying to gather them again.
      int IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty = all_of(VL,
                        [VL0Ty, IndexIdx](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          if (!GEP)
                            return true;
                          return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                        })
                     ? VL0Ty
                     : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                            ->getPointerOperandType()
                                            ->getScalarType());
      // Prepare the operand vector.
      for (Value *V : VL) {
        auto *I = dyn_cast<GetElementPtrInst>(V);
        if (!I) {
          Operands.back().push_back(
              ConstantInt::get(Ty, 0, /*isSigned=*/false));
          continue;
        }
        auto *Op = I->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        if (!CI)
          Operands.back().push_back(Op);
        else
          Operands.back().push_back(ConstantFoldIntegerCast(
              CI, Ty, CI->getValue().isSignBitSet(), *DL));
      }
      TE->setOperand(IndexIdx, Operands.back());

      for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
        buildTree_rec(Operands[I], Depth + 1, {TE, I});
      return;
    }
    case Instruction::Store: {
      bool Consecutive = CurrentOrder.empty();
      if (!Consecutive)
        fixupOrderingIndices(CurrentOrder);
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices, CurrentOrder);
      TE->setOperandsInOrder();
      buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
      if (Consecutive)
        LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
      return;
    }
    case Instruction::Call: {
      // Check that the calls are all to the same vectorizable intrinsic or
      // library function.
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      // Sort operands of the instructions so that each side is more likely to
      // have the same opcode.
      if (isCommutative(VL0)) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        SmallVector<ValueList> Operands;
        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
          Operands.emplace_back();
          if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
            continue;
          for (Value *V : VL) {
            auto *CI2 = cast<CallInst>(V);
            Operands.back().push_back(CI2->getArgOperand(I));
          }
          TE->setOperand(I, Operands.back());
        }
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
          if (Operands[I - 2].empty())
            continue;
          buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
        }
        return;
      }
      TE->setOperandsInOrder();
      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
        // For scalar operands no need to create an entry, since no need to
        // vectorize it.
        if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
          continue;
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *V : VL) {
          auto *CI2 = cast<CallInst>(V);
          Operands.push_back(CI2->getArgOperand(I));
        }
        buildTree_rec(Operands, Depth + 1, {TE, I});
      }
      return;
    }
    case Instruction::ShuffleVector: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

      // Reorder operands if reordering would enable vectorization.
      auto *CI = dyn_cast<CmpInst>(VL0);
      if (isa<BinaryOperator>(VL0) || CI) {
        ValueList Left, Right;
        if (!CI || all_of(VL, [](Value *V) {
              return cast<CmpInst>(V)->isCommutative();
            })) {
          reorderInputsAccordingToOpcode(VL, Left, Right, *this);
        } else {
          auto *MainCI = cast<CmpInst>(S.MainOp);
          auto *AltCI = cast<CmpInst>(S.AltOp);
          CmpInst::Predicate MainP = MainCI->getPredicate();
          CmpInst::Predicate AltP = AltCI->getPredicate();
          assert(MainP != AltP &&
                 "Expected different main/alternate predicates.");
          // Collect operands - commute if it uses the swapped predicate or
          // the alternate operation.
          for (Value *V : VL) {
            auto *Cmp = cast<CmpInst>(V);
            Value *LHS = Cmp->getOperand(0);
            Value *RHS = Cmp->getOperand(1);

            if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
              if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
                std::swap(LHS, RHS);
            } else {
              if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
                std::swap(LHS, RHS);
            }
            Left.push_back(LHS);
            Right.push_back(RHS);
          }
        }
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        return;
      }

      TE->setOperandsInOrder();
      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      return;
    }
    default:
      break;
    }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that the struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
7505 const auto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7506 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
7507 auto *E0 = cast<Instruction>(*It);
7509 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7513 Value *Vec = E0->getOperand(0);
7515 CurrentOrder.
clear();
7519 if (E0->getOpcode() == Instruction::ExtractValue) {
7524 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7528 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
7531 unsigned E = VL.
size();
7532 if (!ResizeAllowed && NElts != E)
7535 unsigned MinIdx = NElts, MaxIdx = 0;
7537 auto *Inst = dyn_cast<Instruction>(V);
7540 if (Inst->getOperand(0) != Vec)
7542 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
7543 if (isa<UndefValue>(EE->getIndexOperand()))
7548 const unsigned ExtIdx = *
Idx;
7549 if (ExtIdx >= NElts)
7551 Indices[
I] = ExtIdx;
7552 if (MinIdx > ExtIdx)
7554 if (MaxIdx < ExtIdx)
7557 if (MaxIdx - MinIdx + 1 > E)
7559 if (MaxIdx + 1 <= E)
7563 bool ShouldKeepOrder =
true;
7569 CurrentOrder.
assign(E, E);
7570 for (
unsigned I = 0;
I < E; ++
I) {
7573 const unsigned ExtIdx = Indices[
I] - MinIdx;
7574 if (CurrentOrder[ExtIdx] != E) {
7575 CurrentOrder.
clear();
7578 ShouldKeepOrder &= ExtIdx ==
I;
7579 CurrentOrder[ExtIdx] =
I;
7581 if (ShouldKeepOrder)
7582 CurrentOrder.
clear();
7584 return ShouldKeepOrder;
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  // If a vector library call is available, compare against its cost too.
  auto LibCost = IntrinsicCost;
  // ...
  return {IntrinsicCost, LibCost};
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    // ...
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here.
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}
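/// Base class for the shuffle emission helpers, shared by the cost model and
/// codegen: merges shuffle masks, peeks through chains of shufflevector
/// instructions and emits the minimal final shuffle when one is required.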
class BaseShuffleAnalysis {
protected:
  /// Checks if the mask is an identity mask.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          return ShuffleVectorInst::isIdentityMask(Mask.slice(Idx * VF, VF),
                                                   VF);
        }))
      return true;
    // ...
  }

  /// Combines two shuffle masks into one.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }

  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or a resizing shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store the current mask in IdentityMask so this info is not lost
          // if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // A permutation of the 2 operands of the shuffle is not allowed.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
                                   SV->getShuffleMask().end());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
7923 template <
typename T,
typename ShuffleBuilderTy>
7925 ShuffleBuilderTy &Builder) {
7926 assert(V1 &&
"Expected at least one vector value.");
7928 Builder.resizeToMatch(V1, V2);
7929 int VF =
Mask.size();
7930 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
7931 VF = FTy->getNumElements();
7938 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7941 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7943 CombinedMask1[
I] =
Mask[
I];
7945 CombinedMask2[
I] =
Mask[
I] - VF;
7952 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
7953 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
7956 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7957 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7962 ExtMask1[
Idx] = SV1->getMaskValue(
I);
7965 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7967 ExtMask1, UseMask::SecondArg);
7972 ExtMask2[
Idx] = SV2->getMaskValue(
I);
7975 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7977 ExtMask2, UseMask::SecondArg);
7978 if (SV1->getOperand(0)->getType() ==
7979 SV2->getOperand(0)->getType() &&
7980 SV1->getOperand(0)->getType() != SV1->getType() &&
7983 Op1 = SV1->getOperand(0);
7984 Op2 = SV2->getOperand(0);
7986 SV1->getShuffleMask().end());
7987 int LocalVF = ShuffleMask1.size();
7988 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
7989 LocalVF = FTy->getNumElements();
7990 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7991 CombinedMask1.swap(ShuffleMask1);
7993 SV2->getShuffleMask().end());
7994 LocalVF = ShuffleMask2.size();
7995 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
7996 LocalVF = FTy->getNumElements();
7997 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7998 CombinedMask2.swap(ShuffleMask2);
8001 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
8002 Builder.resizeToMatch(Op1, Op2);
8003 VF = std::max(cast<VectorType>(Op1->
getType())
8005 .getKnownMinValue(),
8006 cast<VectorType>(Op2->
getType())
8008 .getKnownMinValue());
8009 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
8012 "Expected undefined mask element");
8013 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
8019 isa<ShuffleVectorInst>(Op1) &&
8020 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
8022 return Builder.createIdentity(Op1);
8023 return Builder.createShuffleVector(
8027 if (isa<PoisonValue>(V1))
8028 return Builder.createPoison(
8029 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
8031 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
8032 assert(V1 &&
"Expected non-null value after looking through shuffles.");
8035 return Builder.createShuffleVector(V1, NewMask);
8036 return Builder.createIdentity(V1);
/// Returns the cost of the shuffle with the given \p Kind, vector type \p Tp
/// and \p Mask. Tries to recognize an insert_subvector pattern hidden inside a
/// two-source permute and price it as SK_InsertSubvector instead.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI,
                                      TTI::ShuffleKind Kind, VectorType *Tp,
                                      ArrayRef<int> Mask,
                                      TTI::TargetCostKind CostKind,
                                      int Index, VectorType *SubTp) {
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Kind == TTI::SK_PermuteTwoSrc && Mask.size() > 2 &&
      ShuffleVectorInst::isInsertSubvectorMask(
          Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(TTI::SK_InsertSubvector, Tp, Mask, CostKind,
                                Index, SubTp);
  }
  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}
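
// Prices the pointer (GEP) computations for a vectorized chain of loads,
// stores or GEPs: the scalar cost of the original pointer chain versus the
// cost of the single pointer computation that survives vectorization.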
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case: pointers are the arguments of loads or stores vectorized as
    // plain wide loads/stores. Scalar cost is estimated as a set of pointers
    // with known relationship between them; for vector code only BasePtr and
    // any pointers with uses outside the tree survive.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in vectorized code if it is not a
      // GEP instruction or has other uses.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case: pointers are the arguments of loads to be transformed into a
    // masked gather; all scalar GEPs are removed by vectorization.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
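
// After the tree is built, re-examine vectorizable load/store nodes: if the
// target supports strided memory accesses and the cost model says a strided
// access is cheaper than the wide access plus its reverse shuffle, switch the
// node to StridedVectorize.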
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather vectorizations.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      // Symmetric to the load case: consecutive store + reverse vs. strided
      // store with stride -1.
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      if (isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        // ...
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
}
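
// ShuffleCostEstimator implements BaseShuffleAnalysis for the cost model: it
// mirrors the shuffles the code generator would emit for a gather/buildvector
// node and accumulates their TTI cost instead of emitting instructions.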
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  Type *ScalarTy = nullptr;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, still trying to estimate the cost for the same nodes, so the
  /// actual cost estimation (virtual shuffle emission) can be delayed.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      if (auto *VTy = dyn_cast<VectorType>(Ty))
        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL.begin(), VL.end());
    // Improve gather cost for gather of loads, if we can group some of the
    // loads into vector loads.
    InstructionsState S = getSameOpcode(VL, *R.TLI);
    const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = R.getMinVF(2 * Sz);
    if (VL.size() > 2 &&
        ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
         (InVectors.empty() &&
          any_of(seq<unsigned>(0, VL.size() / MinVF), [&](unsigned Idx) {
            ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
            InstructionsState S = getSameOpcode(SubVL, *R.TLI);
            return S.getOpcode() == Instruction::Load && !S.isAltShuffle();
          }))) &&
        !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
        !isSplat(Gathers)) {
      InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
      SetVector<Value *> VectorizedLoads;
      SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
      SmallVector<unsigned> ScatterVectorized;
      unsigned StartIdx = 0;
      unsigned VF = VL.size() / 2;
      for (; VF >= MinVF; VF /= 2) {
        for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
             Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
            InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
            if (SliceS.getOpcode() != Instruction::Load ||
                SliceS.isAltShuffle())
              continue;
          }
          if (!VectorizedLoads.count(Slice.front()) &&
              !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
            SmallVector<Value *> PointerOps;
            OrdersType CurrentOrder;
            LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
                                                CurrentOrder, PointerOps);
            switch (LS) {
            case LoadsState::Vectorize:
            case LoadsState::ScatterVectorize:
            case LoadsState::StridedVectorize:
              // Mark the vectorized loads so that we don't vectorize them
              // again.
              if (LS == LoadsState::Vectorize && CurrentOrder.empty())
                VectorizedStarts.emplace_back(Cnt, LS);
              else
                ScatterVectorized.push_back(Cnt);
              VectorizedLoads.insert(Slice.begin(), Slice.end());
              // If we vectorized initial block, no need to try to vectorize
              // it again.
              if (Cnt == StartIdx)
                StartIdx += VF;
              break;
            case LoadsState::Gather:
              break;
            }
          }
        }
        // Check if the whole array was vectorized already - exit.
        if (StartIdx >= VL.size())
          break;
        // Found vectorizable parts - exit.
        if (!VectorizedLoads.empty())
          break;
      }
      if (!VectorizedLoads.empty()) {
        unsigned NumParts = TTI.getNumberOfParts(VecTy);
        bool NeedInsertSubvectorAnalysis =
            !NumParts || (VL.size() / VF) > NumParts;
        // Get the cost for gathered loads.
        for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
          if (VectorizedLoads.contains(VL[I]))
            continue;
          GatherCost +=
              getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
        }
        // The cost for vectorized loads.
        InstructionCost ScalarsCost = 0;
        for (Value *V : VectorizedLoads) {
          auto *LI = cast<LoadInst>(V);
          ScalarsCost +=
              TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
                                  LI->getAlign(), LI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), LI);
        }
        auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
        for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
          auto *LI = cast<LoadInst>(VL[P.first]);
          Align Alignment = LI->getAlign();
          GatherCost +=
              P.second == LoadsState::Vectorize
                  ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
                                        LI->getPointerAddressSpace(), CostKind,
                                        TTI::OperandValueInfo(), LI)
                  : TTI.getStridedMemoryOpCost(
                        Instruction::Load, LoadTy, LI->getPointerOperand(),
                        /*VariableMask=*/false, Alignment, CostKind, LI);
          // Estimate GEP cost.
          SmallVector<Value *> PointerOps(VF);
          for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
            PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
          auto [ScalarGEPCost, VectorGEPCost] =
              getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
                          Instruction::Load, CostKind, LI->getType(), LoadTy);
          GatherCost += VectorGEPCost - ScalarGEPCost;
        }
        for (unsigned P : ScatterVectorized) {
          auto *LI0 = cast<LoadInst>(VL[P]);
          ArrayRef<Value *> Slice = VL.slice(P, VF);
          Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
          GatherCost += TTI.getGatherScatterOpCost(
              Instruction::Load, LoadTy, LI0->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
          // Estimate GEP cost.
          SmallVector<Value *> PointerOps(VF);
          for (auto [I, V] : enumerate(Slice))
            PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
          OrdersType Order;
          if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
                              Order)) {
            Value *Ptr0 = PointerOps.front();
            Type *ScalarTy = Ptr0->getType();
            auto *VecTy = getWidenedType(ScalarTy, VF);
            auto [ScalarGEPCost, VectorGEPCost] =
                getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
                            CostKind, ScalarTy, VecTy);
            GatherCost += VectorGEPCost - ScalarGEPCost;
            if (!Order.empty()) {
              SmallVector<int> Mask;
              inversePermutation(Order, Mask);
              GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                             VecTy, Mask, CostKind);
            }
          } else {
            GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
                                          PointerOps.front()->getType());
          }
        }
        if (NeedInsertSubvectorAnalysis) {
          // Add the cost for the subvectors insert.
          SmallVector<int> ShuffleMask(VL.size());
          for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
            for (unsigned Idx : seq<unsigned>(0, E))
              ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
            GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
                                           ShuffleMask, CostKind, I, LoadTy);
          }
        }
        GatherCost -= ScalarsCost;
      }
      GatherCost = std::min(BaseCost, GatherCost);
    } else if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle)
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, /*Index=*/0,
                                    PoisonValue::get(VecTy), *It) +
             ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy, ShuffleMask,
                              CostKind);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };
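
  // computeExtractCost prices a gather of extractelements as one or two
  // per-register shuffles: it normalizes the mask per target register, and
  // falls back to a single whole-vector permute when that is cheaper.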
  InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                                     ArrayRef<std::optional<TTI::ShuffleKind>>
                                         ShuffleKinds,
                                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallSet<int, 4> RegIndices;
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of
    // creating a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost += ::getShuffleCost(TTI, *RegShuffleKind,
                                 getWidenedType(ScalarTy, EltsPerVector),
                                 SubMask);
      }
      InstructionCost OriginalCost = Cost;
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(
            TTI, TTI::SK_ExtractSubvector,
            getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)),
            std::nullopt, CostKind, Idx,
            getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt: check if just a permute is better estimated than the
      // subvector extracts.
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }
  /// Estimates the cost of permuting the same tree entries several times,
  /// delaying actual shuffle emission while the input nodes stay the same.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before,
      // no need to estimate another cost with the sub-Mask; instead, include
      // this sub-Mask into the CommonMask to estimate it later and avoid
      // double cost estimation.
      if ((InVectors.size() == 2 &&
           InVectors.front().get<const TreeEntry *>() == &E1 &&
           InVectors.back().get<const TreeEntry *>() == E2) ||
          (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // ones and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = InVectors.front().get<const TreeEntry *>();
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      Cost += createShuffle(&E1, E2, Mask);
      transformMaskAfterShuffle(CommonMask, Mask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
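
  // createShuffle (cost-model flavor): materializes the two inputs as dummy
  // constants of a common vector factor, folds min-bitwidth casts into the
  // cost, and defers to BaseShuffleAnalysis::createShuffle with the
  // ShuffleCostBuilder so identity shuffles end up free.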
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask.begin(), Mask.end());
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = P1.get<const TreeEntry *>();
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = P2.get<const TreeEntry *>();
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = P1.get<const TreeEntry *>();
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not identity/broadcast? Try to see if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        SmallVector<int> ReorderMask;
        inversePermutation(E->ReorderIndices, ReorderMask);
        ::addMask(CommonMask, ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      const TreeEntry *E2 = P2.get<const TreeEntry *>();
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      const TreeEntry *E1 = P1.get<const TreeEntry *>();
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      CommonVF =
          std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : ScalarTy(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    ArrayRef<Value *> VL = E->Scalars;
    // If the resulting type is scalarized, do not adjust the cost.
    if (NumParts == VL.size())
      return nullptr;
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((!TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of the instruction are going to be vectorized and the
        // instruction itself is not going to be vectorized, consider this
        // extractelement as removed.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        const TreeEntry *VE = R.getTreeEntry(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (VE && VE != E))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(),
                                                 Ext->getType(),
                                                 EE->getVectorOperandType(),
                                                 Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that the gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size())
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size())
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors (in form of tree entries).
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        InVectors.front()
                            .get<const TreeEntry *>()
                            ->Scalars[P.index()]);
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(
          InVectors.size() == 1 &&
          InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
          all_of(enumerate(CommonMask),
                 [&](auto P) {
                   Value *Scalar = InVectors.front()
                                       .get<const TreeEntry *>()
                                       ->Scalars[P.index()];
                   if (P.value() == PoisonMaskElem)
                     return P.value() == Mask[P.index()] ||
                            isa<UndefValue>(Scalar);
                   if (isa<Constant>(V1))
                     return true;
                   auto *EI = cast<ExtractElementInst>(Scalar);
                   return EI->getVectorOperand() == V1;
                 }) &&
          "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF,
          cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
              ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = Vec.get<Value *>();
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
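
// getOperandEntry maps operand \p Idx of tree entry \p E back to the tree
// entry that produces it, consulting MultiNodeScalars and the gather nodes
// hanging off E when the operand is not directly vectorized.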
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  Value *Op = E->getOperand(Idx).front();
  if (const TreeEntry *TE = getTreeEntry(Op)) {
    if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.EdgeIdx == Idx && EI.UserTE == E;
        }) != TE->UserTreeIndices.end())
      return TE;
    auto MIt = MultiNodeScalars.find(Op);
    if (MIt != MultiNodeScalars.end()) {
      for (const TreeEntry *TE : MIt->second) {
        if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
              return EI.EdgeIdx == Idx && EI.UserTE == E;
            }) != TE->UserTreeIndices.end())
          return TE;
      }
    }
  }
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
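
// getEntryCost computes the cost contribution of a single tree entry: the
// difference between the vectorized form and the summed cost of the original
// scalars, plus the common shuffle cost for any reordering/reuse indices.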
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (!E->isGather()) {
    if (auto *SI = dyn_cast<StoreInst>(VL[0]))
      ScalarTy = SI->getValueOperand()->getType();
    else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
      ScalarTy = CI->getOperand(0)->getType();
    else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
      ScalarTy = IE->getOperand(1)->getType();
  }
  if (!isValidElementType(ScalarTy))
    return InstructionCost::getInvalid();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end())
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
  if (E->isGather()) {
    if (allConstant(VL))
      return 0;
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
  if (!E->ReorderIndices.empty() &&
      (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (NeedToShuffleReuses)
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> GetVectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some instructions there is no need to calculate the cost for
          // each particular instruction; use the cost of a single instruction
          // multiplied by the total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = GetVectorCost(CommonCost);
        // Check if the current node must be resized, if the parent node is
        // not resized.
        if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
          const EdgeInfo &EI = E->UserTreeIndices.front();
          if ((EI.UserTE->getOpcode() != Instruction::Select ||
               EI.EdgeIdx != 0) &&
              It != MinBWs.end()) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               CCH, CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference from vectorizing a set of GEPs.
  // Negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;

      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
        if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
          if (!OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                            OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of
          // extractelement/ext pair.
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
          // Subtract the cost of s|zext which is subtracted separately.
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
                                     CostKind, *getExtractIndex(I));
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-size vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    InstructionCost Cost = 0;
    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                          /*Insert*/ true, /*Extract*/ false,
                                          CostKind);

    // First cost - resize to actual vector size if not identity shuffle or
    // need to shift the vector.
    auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, InsertVecTy,
                               Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // Second cost - permutation with subvector, if some elements are from the
    // initial vector or inserting a subvector.
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
        Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy,
                                 std::nullopt, CostKind, OffsetBeg - Offset,
                                 InsertVecTy);
      } else {
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
        Cost +=
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VI), CostKind, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is
      // just a bitcast (here it is a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpInst::Predicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                           ? CmpInst::BAD_FCMP_PREDICATE
                                           : CmpInst::BAD_ICMP_PREDICATE;
      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
                                       ? CmpInst::BAD_FCMP_PREDICATE
                                       : CmpInst::BAD_ICMP_PREDICATE;

      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, VI);
      // Check if it is possible and profitable to use min/max for selects.
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
      if (MinMaxID != Intrinsic::not_intrinsic) {
        Type *CanonicalType = OrigScalarTy;
        IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                          {CanonicalType, CanonicalType});
        InstructionCost IntrinsicCost =
            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
        // If the selects are the only uses of the compares, they will be dead
        // and we can adjust the cost by removing their cost.
        if (SelectOnly) {
          auto *CI = cast<CmpInst>(VI->getOperand(0));
          IntrinsicCost -= TTI->getCmpSelInstrCost(
              CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
              CI->getPredicate(), CostKind, CI);
        }
        ScalarCost = std::min(ScalarCost, IntrinsicCost);
      }

      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());

      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
      // Check if it is possible and profitable to use min/max for selects.
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
      if (MinMaxID != Intrinsic::not_intrinsic) {
        Type *CanonicalType = VecTy;
        IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                          {CanonicalType, CanonicalType});
        InstructionCost IntrinsicCost =
            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
        // If the selects are the only uses of the compares, they will be dead
        // and we can adjust the cost by removing their cost.
        if (SelectOnly) {
          auto *CI =
              cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
          IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
                                                   MaskTy, CI->getPredicate(),
                                                   CostKind, CI);
        }
        VecCost = std::min(VecCost, IntrinsicCost);
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
      TTI::OperandValueInfo Op2Info =
          TTI::getOperandInfo(VI->getOperand(OpIdx));
      SmallVector<const Value *> Operands(VI->operand_values());
      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                         Op1Info, Op2Info, Operands, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, std::nullopt, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      if (E->State == TreeEntry::Vectorize) {
        VecLdCost = TTI->getMemoryOpCost(
            Instruction::Load, VecTy, LI0->getAlign(),
            LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, it is not a terminal node;
    // the address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate the cost of the GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
        VecStCost = TTI->getMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getAlign(),
            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }

    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      }
      return TTI->getCallInstrCost(CI->getCalledFunction(),
                                   CI->getFunctionType()->getReturnType(),
                                   CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
                                 It != MinBWs.end() ? It->second.first : 0);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    // Need to clear CommonCost since the final shuffle cost is included into
    // the vector cost.
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      // VecCost is equal to the sum of the cost of creating 2 vectors
      // and the cost of creating the shuffle.
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here since we're going to reuse the
        // same main/alternate vector ops, just do different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost =
            TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
                                      CI0->getPredicate(), CostKind, VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      VecCost +=
          ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc, FinalVecTy, Mask);
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on some targets. If we are checking this, query the target.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  default:
    llvm_unreachable("Unknown instruction");
  }
}
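
// isFullyVectorizableTinyTree decides whether a tree of height 1 or 2 is
// worth vectorizing even though it is below MinTreeSize: the gathers it
// contains must be cheap (constants, splats, shuffles of extracts, or loads).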
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            ((TE->getOpcode() == Instruction::ExtractElement ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask)) ||
            (TE->isGather() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if possible.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;

  return true;
}
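
// Detects an "or chain of zext'ed loads" rooted at \p Root that the backend
// will turn back into a single wide load (load combining); SLP vectorization
// of such a chain would be counterproductive.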
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target,
  // but <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;

  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization when the cost threshold is at its
  // default: the cost of vectorized PHI nodes is almost always 0 plus the
  // cost of the gathers, if any.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                TE->getOpcode() != Instruction::ExtractElement &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  assert(VectorizableTree.empty()
             ? ExternalUses.empty()
             : true && "We shouldn't have any external users");

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}
InstructionCost BoUpSLP::getSpillCost() const {
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  InstructionCost Cost = 0;

  SmallPtrSet<Instruction *, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so
  // later instructions are guaranteed to be visited first. For instructions
  // in different basic blocks we only scan to the beginning of the block, so
  // their order does not matter, as long as all instructions in a basic
  // block are grouped together. Using dominance ensures a deterministic
  // order.
  SmallVector<Instruction *, 16> OrderedScalars;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    if (!Inst)
      continue;
    OrderedScalars.push_back(Inst);
  }
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) ==
               (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  });

  for (Instruction *Inst : OrderedScalars) {
    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
                                 PrevInstIt =
                                     PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      auto NoCallIntrinsic = [this](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
          if (II->isAssumeLikeIntrinsic())
            return true;
          FastMathFlags FMF;
          SmallVector<Type *, 4> Tys;
          for (auto &ArgOp : II->args())
            Tys.push_back(ArgOp->getType());
          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
            FMF = FPMO->getFastMathFlags();
          IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
                                      FMF);
          InstructionCost IntrCost =
              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
          InstructionCost CallCost = TTI->getCallInstrCost(
              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
          if (IntrCost < CallCost)
            return true;
        }
        return false;
      };

      // Debug information does not impact spill cost.
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;

      ++PrevInstIt;
    }

    if (NumCalls) {
      SmallVector<Type *, 4> V;
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        V.push_back(getWidenedType(ScalarTy, BundleWidth));
      }
      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
    }

    PrevInst = Inst;
  }

  return Cost;
}

/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
/// buildvector sequence.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  unsigned Idx1 = *getElementIndex(IE1);
  unsigned Idx2 = *getElementIndex(IE2);
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(I1).value_or(Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(I2).value_or(Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
namespace {
/// Returns the incoming Value *, if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace
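
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks, in several steps:
/// 1. If the Base vector is not an undef vector, resize the very first mask
///    to the common VF and perform the action for 2 input vectors (including
///    the non-undef Base); combine subsequent masks with the result.
/// 2. If the Base is undef and there is only 1 shuffle mask, perform the
///    action only for 1 vector, if the mask is not the identity mask.
/// 3. If more than 2 masks are used, keep performing 2-vector shuffle
///    actions, combining the masks properly between the steps.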
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    auto *V = ValueSelect::get<T *>(Base);
    (void)V;
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for the single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform
    // 2-vector shuffles step by step, combining shuffles between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size;
      // we can shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first,
                                               Mask, /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          Mask[I] = (Res1.second ? I : Mask[I]);
        } else if (SecMask[I] != PoisonMaskElem) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  bool IsBaseNotUndef = !IsBaseUndef.all();
  (void)IsBaseNotUndef;
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
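
// getTreeCost sums the per-entry costs, adds the extract cost of externally
// used scalars (folding insertelement users into final shuffles instead of
// extracts where possible), the spill cost, and any min-bitwidth resize
// costs, and returns the total vector-minus-scalar cost of the tree.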
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    if (TE.isGather()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
                          << shortBundleName(TE.Scalars) << ".\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                      << shortBundleName(TE.Scalars) << ".\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
  SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  for (ExternalUser &EU : ExternalUses) {
    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
      continue;

    // No extract cost for vector "scalar".
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;

    // If the found user is an insertelement, do not calculate extract cost
    // but try to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
          auto *It = find_if(
              FirstUsers,
              [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
                return areTwoInsertFromSameBuildVector(
                    VU, cast<InsertElementInst>(Pair.first),
                    [this](InsertElementInst *II) -> Value * {
                      Value *Op0 = II->getOperand(0);
                      if (getTreeEntry(II) && !getTreeEntry(Op0))
                        return nullptr;
                      return Op0;
                    });
              });
          int VecId = -1;
          if (It == FirstUsers.end()) {
            (void)ShuffleMasks.emplace_back();
            SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            // Find the insertvector, vectorized in the tree, if any.
            Value *Base = VU;
            while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
              if (IEBase != EU.User &&
                  (!IEBase->hasOneUse() ||
                   getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
                break;
              // Build the mask for the vectorized insertelement instructions.
              if (const TreeEntry *E = getTreeEntry(IEBase)) {
                VU = IEBase;
                do {
                  IEBase = cast<InsertElementInst>(Base);
                  int Idx = *getElementIndex(IEBase);
                  assert(Mask[Idx] == PoisonMaskElem &&
                         "InsertElementInstruction used already.");
                  Mask[Idx] = Idx;
                  Base = IEBase->getOperand(0);
                } while (E == getTreeEntry(Base));
                break;
              }
              Base = cast<InsertElementInst>(Base)->getOperand(0);
            }
            FirstUsers.emplace_back(VU, ScalarTE);
            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
            VecId = FirstUsers.size() - 1;
            auto It = MinBWs.find(ScalarTE);
            if (It != MinBWs.end() &&
                VectorCasts
                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                    .second) {
              unsigned BWSz = It->second.first;
              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
              unsigned VecOpcode;
              if (DstBWSz < BWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              InstructionCost C = TTI->getCastInstrCost(
                  VecOpcode, FTy,
                  getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                                 FTy->getNumElements()),
                  TTI::CastContextHint::None, CostKind);
              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                                << " for extending externally used vector with "
                                   "non-equal minimum bitwidth.\n");
              Cost += C;
            }
          } else {
            if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
              It->first = VU;
            VecId = std::distance(FirstUsers.begin(), It);
          }
          int InIdx = *InsertIdx;
          SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[InIdx] = EU.Lane;
          DemandedElts[VecId].setBit(InIdx);
          continue;
        }
      }
    }

    // Leave the GEPs as is; they are free in most cases and it is better to
    // keep them as GEPs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for_each(enumerate(ExternalUses), [&](const auto &P) {
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        });
      }
      // The original GEP can be reused if no operands are vectorized or they
      // are marked as externally used already.
      bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
        if (!getTreeEntry(V))
          return true;
        auto It = ValueToExtUses->find(V);
        if (It != ValueToExtUses->end()) {
          // Replace all uses to avoid compiler crash.
          ExternalUses[It->second].User = nullptr;
          return true;
        }
        return false;
      });
      if (CanBeUsedAsGEP) {
        ExtractCost += TTI->getInstructionCost(GEP, CostKind);
        ExternalUsesAsGEPs.insert(EU.Scalar);
        continue;
      }
    }

    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign extend if needed.
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    auto It = MinBWs.find(getTreeEntry(EU.Scalar));
    if (It != MinBWs.end()) {
      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
      unsigned Extend =
          It->second.second ? Instruction::SExt : Instruction::ZExt;
      VecTy = getWidenedType(MinTy, BundleWidth);
      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                   VecTy, EU.Lane);
    } else {
      ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
                                             VecTy, CostKind, EU.Lane);
    }
  }
  // Add reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None,
                                      TTI::TCK_RecipThroughput);
      }
    }
  }

  InstructionCost SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                           getWidenedType(TE->getMainOp()->getType(), VecVF),
                           OrigMask);
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      Cost += C;
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
    Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
    auto Vector = ShuffleMasks[I].takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
        /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
    Cost -= InsertCost;
  }
  // Add the cost for reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize)
        Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      auto *SrcVecTy =
          getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
      auto *DstVecTy =
          getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
      TTI::CastContextHint CCH = TTI::CastContextHint::None;
      switch (E.getOpcode()) {
      case Instruction::SExt:
      case Instruction::ZExt:
      case Instruction::Trunc: {
        const TreeEntry *OpTE = getOperandEntry(&E, 0);
        CCH = getCastContextHint(*OpTE);
        break;
      }
      default:
        break;
      }
      InstructionCost CastCost =
          TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                TTI::TCK_RecipThroughput);
      Cost += CastCost;
      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                        << " for final resize for reduction from " << SrcVecTy
                        << " to " << DstVecTy << "\n";
                 dbgs() << "SLP: Current total cost = " << Cost << "\n");
    }
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
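
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gather them into a bunch, which is highly likely to
/// be detectable as a shuffle of 1 or 2 input vectors. If the attempt is
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.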
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements were
  // not selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
  }
  return Res;
}

/// Splits \p VL into NumParts register-sized slices and applies
/// tryToGatherSingleRegisterExtractElements to each slice independently.
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
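
// isGatherShuffledSingleRegisterEntry checks whether a gather node can be
// represented as a permutation of one or two already-vectorized tree entries:
// it intersects, per scalar, the sets of tree entries that produce the value,
// while verifying that reusing those entries is legal with respect to
// dominance and insertion-point ordering.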
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    /* ... */) {
  // ...
  const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  // ...
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  // ...
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  // ...
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // ...
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // ...
    if (TEInsertPt->getParent() != InsertBlock && /* ... */) {
      // ...
    }
    if (TEInsertPt->getParent() == InsertBlock && /* ... */) {
      // ...
    }
    // ...
  };
  // ...
  for (Value *V : VL) {
    // ...
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      // ...
      assert(any_of(/* ... */,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
      // ...
      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI ? /* ... */
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // ...
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) {
          // ...
        }
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx) {
          // ...
        }
      }
      // ...
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt)) {
        // ...
      }
    }
    // ...
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      // ...
      if (VTE->State != TreeEntry::Vectorize) {
        auto It = MultiNodeScalars.find(V);
        if (It == MultiNodeScalars.end()) {
          // ...
        }
        VTE = *It->getSecond().begin();
        // ...
        auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == It->getSecond().end()) {
          // ...
        }
      }
      // ...
      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) {
        // ...
      }
    }
    if (VToTEs.empty()) {
      // ...
    }
    if (UsedTEs.empty()) {
      // ...
    } else {
      // ...
      if (!VToTEs.empty()) {
        // ...
        VToTEs = SavedVToTEs;
      }
      // ...
      if (UsedTEs.size() == 2) {
        // ...
      }
      UsedTEs.push_back(SavedVToTEs);
      // ...
    }
  }
  if (UsedTEs.empty()) {
    // ...
    return std::nullopt;
  }
  // ...
  if (UsedTEs.size() == 1) {
    // ...
                                           UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // ...
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        // ...
      }
      // ...
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I])) {
          // ...
        }
      // ...
    }
    // ...
    Entries.push_back(FirstEntries.front());
  } else {
    // ...
    assert(UsedTEs.size() == 2 && "Expected at most 2 permuted entries.");
    // ...
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        // ...
      }
      // ...
    }
    // ...
                                            UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        // ...
        Entries.push_back(It->second);
        Entries.push_back(TE);
        // ...
      }
    }
    // ...
    if (Entries.empty()) {
      // ...
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          });
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    }
  }
  // ...
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // ...
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      // ...
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
      if (cast<Instruction>(In)->getParent() != /* ... */) {
        // ...
      }
    }
    // ...
  };
  // ...
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           /* ... */
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // ...
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    // ...
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end()) {
      // ...
    }
    if (/* ... */
        ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
         (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))) {
      // ...
    }
    unsigned Idx = It->second;
    // ...
  }
  // ...
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I)) {
      // ...
    }
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    // ...
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      /* ... */
          .slice(Part * VL.size(),
                 std::min<int>(VL.size(), TE->Scalars.size()))) {
    // ...
    return std::nullopt;
  }
  // ...
  bool IsIdentity = Entries.size() == 1;
  // ...
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  switch (Entries.size()) {
  case 1:
    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) {
      // ...
    }
    break;
  case 2:
    if (EntryLanes.size() > 2 || VL.size() <= 2) {
      // ...
    }
    break;
  default:
    break;
  }
  // ...
  std::fill(std::next(Mask.begin(), Part * VL.size()), /* ... */);
  return std::nullopt;
}
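//===----------------------------------------------------------------------===//
// Illustrative sketch (added for exposition, not part of the original
// source): the EntryLanes loop above fills a shuffle mask from
// (source-entry, lane) pairs and tracks whether the result is an identity
// shuffle. A minimal standalone model of that check, assuming a hypothetical
// lane-lookup callback that stands in for TreeEntry::findLaneForValue and
// ignoring the per-register Part offset for simplicity.
//===----------------------------------------------------------------------===//
#include <utility>
#include <vector>

namespace slp_example {
inline bool
isIdentityMapping(const std::vector<std::pair<unsigned, int>> &EntryLanes,
                  std::vector<int> &Mask, unsigned NumEntries,
                  int (*FindLane)(unsigned Entry, int Lane)) {
  // A single entry starts as a candidate identity; every lane that does not
  // map to itself disqualifies it.
  bool IsIdentity = NumEntries == 1;
  for (const auto &Pair : EntryLanes) {
    Mask[Pair.second] = FindLane(Pair.first, Pair.second);
    IsIdentity &= Mask[Pair.second] == Pair.second;
  }
  return IsIdentity;
}
} // namespace slp_example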
// ...
BoUpSLP::isGatherShuffledEntry(
    /* ... */) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // ...
  if (TE == VectorizableTree.front().get()) {
    // ...
  }
  if (TE->isNonPowOf2Vec()) {
    // ...
  }
  assert(TE->UserTreeIndices.size() == 1 &&
         "Expected only single user of the gather node.");
  assert(/* ... */ &&
         "Number of scalars must be divisible by NumParts.");
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            /* ... */);
    if (!SubRes)
      SubEntries.clear();
    // ...
    if (/* ... */
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      // ...
      LocalSubEntries.swap(SubEntries);
      // ...
      std::iota(Mask.begin(), Mask.end(), 0);
      // ...
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I])) {
          // ...
        }
      Entries.emplace_back(1, LocalSubEntries.front());
      // ...
    }
  }
  if (all_of(/* ... */,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    // ...
  }
  // ...
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  // ...
  bool DuplicateNonConst = false;
  // ...
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      // ...
    }
    // ...
  };
  // ...
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // ...
    // No need to shuffle duplicates for constants and undefs.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      // ...
    }
    // ...
    EstimateInsertCost(I, V);
    ShuffleMask[I] = I;
    // ...
      DuplicateNonConst = true;
    // ...
    ShuffleMask[I] = Res.first->second;
  }
  // ...
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  // ...
}

// Perform operand reordering on the instructions in VL and return the
// reordered operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(/* ... */) {
  // ...
  VLOperands Ops(VL, R);
  // ...
  Left = Ops.getVL(0);
  Right = Ops.getVL(1);
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  // ...
    return *Res.second;
  // ...
  auto *Front = E->getMainOp();
  // ...
        if (E->getOpcode() == Instruction::GetElementPtr &&
            !isa<GetElementPtrInst>(V)) {
          // ...
        }
        auto *I = cast<Instruction>(V);
        return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
               isVectorLikeInstWithConstOps(I);
  // ...
  auto FindLastInst = [&]() {
    // ...
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      // ...
      if (LastInst->getParent() == I->getParent()) {
        // ...
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              /* ... */) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        // ...
      }
    }
    // ...
  };
  auto FindFirstInst = [&]() {
    // ...
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      // ...
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst)) {
          // ...
        }
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              /* ... */) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn()) {
        // ...
      }
    }
    // ...
  };
  // ...
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       /* ... */
         return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
       /* ... */
         return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
       /* ... */) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res.second = FindLastInst();
  else
    Res.second = FindFirstInst();
  // ...
    return *Res.second;
  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle.
  if (BlocksSchedules.count(BB)) {
    Value *V = E->isOneOf(E->Scalars.back());
    // ...
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        if (Bundle->OpValue == Bundle->Inst)
          Res.second = Bundle->Inst;
  }
  // ...
    Res.second = FindLastInst();
  assert(Res.second && "Failed to find last instruction in bundle");
  return *Res.second;
}
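//===----------------------------------------------------------------------===//
// Illustrative sketch (added for exposition, not part of the original
// source): FindLastInst above picks the "last" instruction of a bundle whose
// scalars may live in different blocks, comparing dominator-tree DFS-in
// numbers across blocks and program order within a block. A minimal
// standalone model where an instruction is just a (BlockDFSNum, PosInBlock)
// pair; all names here are hypothetical.
//===----------------------------------------------------------------------===//
#include <vector>

namespace slp_example {
struct InstPos {
  unsigned BlockDFSNum; // analog of DomTreeNode::getDFSNumIn()
  unsigned PosInBlock;  // analog of Instruction::comesBefore ordering
};

inline InstPos findLastInst(const std::vector<InstPos> &Bundle) {
  InstPos Last = Bundle.front();
  for (const InstPos &I : Bundle) {
    if (Last.BlockDFSNum == I.BlockDFSNum) {
      if (Last.PosInBlock < I.PosInBlock) // same block: later position wins
        Last = I;
    } else if (Last.BlockDFSNum < I.BlockDFSNum) {
      Last = I; // larger DFS-in number means later in the dominance order
    }
  }
  return Last;
}
} // namespace slp_example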
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // ...
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (/* ... */) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // ...
    Builder.SetInsertPoint(/* ... */);
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
  // ...
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    // ...
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second) {
        // ...
      }
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      /* ... */) {
    // ...
    if (Scalar->getType() != Ty) {
      assert(/* ... */ && "Expected integer types only.");
      // ...
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        // ...
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp))) {
          // ...
        }
      }
      Scalar = Builder.CreateIntCast(/* ... */);
    }
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    // ...
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // ...
        User *UserOp = nullptr;
        // ...
        if (auto *SI = dyn_cast<Instruction>(Scalar)) {
          // ...
        }
        unsigned FoundLane = Entry->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, FoundLane);
      }
    }
    // ...
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if (!isa<UndefValue>(VL[I])) {
      // ...
    }
    if (isa<PoisonValue>(VL[I])) {
      // ...
    }
    if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
      // ...
    }
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to
  // make it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  // ...
}
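//===----------------------------------------------------------------------===//
// Illustrative sketch (added for exposition, not part of the original
// source): the gather above emits insertelements for constants first and
// postpones loop-resident or in-tree scalars to the end, so the cheap prefix
// of the build-vector chain can be hoisted. A minimal standalone model of
// the resulting lane order; buildVectorOrder is a hypothetical name.
//===----------------------------------------------------------------------===//
#include <vector>

namespace slp_example {
// Returns the order in which lanes would be written: immediate lanes first,
// postponed lanes afterwards (mirroring PostponedIndices/PostponedInsts).
inline std::vector<unsigned>
buildVectorOrder(const std::vector<bool> &MustPostpone) {
  std::vector<unsigned> Order, Postponed;
  for (unsigned I = 0, E = MustPostpone.size(); I != E; ++I)
    (MustPostpone[I] ? Postponed : Order).push_back(I);
  Order.insert(Order.end(), Postponed.begin(), Postponed.end());
  return Order;
}
} // namespace slp_example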
class ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  // ...
  Type *ScalarTy = nullptr;

  // ...
  class ShuffleIRBuilder {
    // ...
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    // ...
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(/* ... */ && "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth()) {
            // ...
          }
        }
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      // ...
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      // ...
    }
    /// Resizes two input vectors to match their sizes, if they are not equal
    /// yet, by padding the shorter one with an identity-prefix shuffle.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      // ...
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      // ...
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
  };

  // ...
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  // ...
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) {
      // ...
    }

  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    // ...
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(TE->Scalars, EI);
                            }) /* ... */;
          })) {
        // ...
      }
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    Value *Vec = nullptr;
    // ...
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
      // ...
      constexpr int MaxBases = 2;
      // ...
      auto VLMask = zip(VL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem) {
              // ...
            }
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        // ...
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front()) {
        // ...
      }
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      // ...
              Mask.slice(P * SliceSize, /* ... */) /* ... */ &&
             "Expected first part or all previous parts masked.");
      copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      // ...
          cast<FixedVectorType>(Vec->getType())->getNumElements();
      // ...
      unsigned SubVecVF =
          cast<FixedVectorType>(SubVec->getType())->getNumElements();
      NewVF = std::max(NewVF, SubVecVF);
      // ...
      for (int &Idx : SubMask) {
        // ...
      }
      copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      Vec = createShuffle(Vec, SubVec, VecMask);
      TransformToIdentity(VecMask);
      // ...
    }
    // ...
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes not being ready yet.
  std::optional<Value *>
  needToDelay(const TreeEntry *E, /* ... */) const {
    // No need to delay emission if all deps are ready.
    if (all_of(/* ... */, [](/* ... */) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    // ...
  }
  /// Adds 2 input vectors (in the form of tree entries) and the mask.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    // ...
  }
  /// Adds a single input vector (in the form of a tree entry) and the mask.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    // ...
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      // ...
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    // ...
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               /* ... */) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) {
      // ...
        CommonMask[Idx] = Idx + Sz;
    }
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    // ...
  }
12079 "castToScalarTyElem expects V1 to be FixedVectorType");
12080 V1 = castToScalarTyElem(V1);
12081 if (InVectors.
empty()) {
12083 CommonMask.
assign(Mask.begin(), Mask.end());
12086 const auto *It =
find(InVectors, V1);
12087 if (It == InVectors.
end()) {
12088 if (InVectors.
size() == 2 ||
12091 if (InVectors.
size() == 2) {
12092 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
12093 transformMaskAfterShuffle(CommonMask, CommonMask);
12094 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
12095 CommonMask.
size()) {
12096 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
12097 transformMaskAfterShuffle(CommonMask, CommonMask);
12099 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
12102 V->getType() != V1->
getType()
12104 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
12105 ->getNumElements();
12106 if (V->getType() != V1->
getType())
12107 V1 = createShuffle(V1,
nullptr, Mask);
12108 InVectors.
front() = V;
12109 if (InVectors.
size() == 2)
12110 InVectors.
back() = V1;
12117 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
12123 int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
12124 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
12126 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy);
  }
  // ...
    IsFinalized = true;
    // ...
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      // ...
    } else {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      // ...
    }
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) {
      // ...
    }
    assert(/* ... */ &&
           "Expected vector length for the final value before action.");
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (/* ... */) {
      // ...
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
      Vec = createShuffle(Vec, nullptr, ResizeMask);
    }
    Action(Vec, CommonMask);
    InVectors.front() = Vec;
    // ...
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        // ...
      } else {
        // ...
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          // ...
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    // ...
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  // ...
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end()) {
      // ...
    }
  }
  if (S.getOpcode()) {
    auto CheckSameVE = [&](const TreeEntry *VE) {
      return VE->isSame(VL) &&
             (any_of(VE->UserTreeIndices,
                     [E, NodeIdx](const EdgeInfo &EI) {
                       return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                     }) ||
              any_of(VectorizableTree,
                     [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                       return TE->isOperandGatherNode({E, NodeIdx}) &&
                              VE->isSame(TE->Scalars);
                     }));
    };
    TreeEntry *VE = getTreeEntry(S.OpValue);
    bool IsSameVE = VE && CheckSameVE(VE);
    if (!IsSameVE) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
          return TE != VE && CheckSameVE(TE);
        });
        if (I != It->getSecond().end()) {
          // ...
        }
      }
    }
    // ...
      ShuffleInstructionBuilder ShuffleBuilder(
          cast<VectorType>(V->getType())->getElementType(), Builder, *this);
      ShuffleBuilder.add(V, Mask);
      return ShuffleBuilder.finalize(std::nullopt);
    // ...
            cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // ...
        if (isa<PoisonValue>(V)) {
          // ...
        }
        Mask[I] = VE->findLaneForValue(V);
        // ...
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        // ...
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // ...
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      // ...
          VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() &&
                   TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    // ...
  }
  // ...
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  // ...
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  // ...
  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  // ...
  if (!ReorderMask.empty()) {
    // ...
  }
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize) {
    // ...
      return isa<UndefValue>(V) && !isa<PoisonValue>(V);
    // ...
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    // ...
        find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
          return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                   return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                 }) != TE->UserTreeIndices.end();
        });
    if (It == VectorizableTree.end())
      return false;
    // ...
    if ((Mask.size() < InputVF && /* ... */) ||
        (Mask.size() == InputVF && /* ... */)) {
      // ... (rewrite the slice of Mask starting at I * SliceSize)
    } else {
      // ... (rewrite the slice of Mask starting at I * SliceSize)
    }
    // ...
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  // ...
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  // ...
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  if (NumParts == 0 || NumParts >= GatheredScalars.size()) {
    // ...
  }
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      // ...
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand())) {
          // ...
        }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // ...
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // ...
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        /* ... */
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // ...
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          /* ... */
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          /* ... */);
        // ...
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              // ...
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask());
        return Res;
      }
      // ...
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          })) {
        // ...
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        // ...
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // ...
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   /* ... */;
    // ...
    int NumNonConsts = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          // ...
        }
        continue;
      }
      // ...
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0) {
        // ...
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               /* ... */
               (E->UserTreeIndices.size() == 1 &&
                all_of(V->uses(), [E](const Use &U) {
                  return E->UserTreeIndices.front().EdgeIdx !=
                             U.getOperandNo() &&
                         is_contained(
                             E->UserTreeIndices.front().UserTE->Scalars,
                             U.getUser());
                }));
      });
      if (It != Scalars.end()) {
        // ...
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // ...
          ReuseMask[I] = Pos;
        }
      } else {
        // ...
      }
    } else {
      // ...
      for (int I : UndefPos) {
        // ...
        if (isa<UndefValue>(Scalars[I])) {
          // ...
        }
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // ...
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ...
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          // ...
          if (isa<UndefValue>(E->Scalars[I])) {
            // ...
          }
          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (/* ... */) {
            // ...
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            // ...
          }
        }
      }
      if (/* ... */) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (/* ... */) {
        IsUsedInExpr &= FindReusedSplat(
            /* ... */,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size());
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        // ...
      } else {
        IsUsedInExpr = false;
        // ...
      }
    }
    if (!GatherShuffles.empty()) {
      // ...
      for (const auto [I, TEs] : enumerate(Entries)) {
        assert(/* ... */ &&
               "No shuffles with empty entries list expected.");
        assert(/* ... */ &&
               "Expected shuffle of 1 or 2 entries.");
        // ...
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          IsUsedInExpr &= FindReusedSplat(
              VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          if (TEs.front()->VectorizedValue) {
            // ...
          }
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) {
            // ...
          }
        }
      }
    }
    // ...
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // ...
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(/* ... */,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   /* ... */
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         /* ... */) ||
        (!GatherShuffles.empty() &&
         all_of(/* ... */,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  /* ... */
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         /* ... */);
    bool EnoughConstsForShuffle =
        /* ... */
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        /* ... */
          return isa<Constant>(V) && !isa<UndefValue>(V);
        /* ... */
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(/* ... */,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         /* ... */
          return isa<Constant>(V) && !isa<PoisonValue>(V);
        /* ... */);
    // ...
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I])) {
        // ...
      }
    }
    // ...
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      // ...
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (/* ... */) {
    // Gather unique scalars and all constants.
    // ...
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  } else {
    // Gather all constants.
    // ...
    for (auto [I, V] : enumerate(E->Scalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(E->Scalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
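//===----------------------------------------------------------------------===//
// Illustrative sketch (added for exposition, not part of the original
// source): TryPackScalars above deduplicates the gathered scalars and
// records, per lane, which unique slot each lane reuses, following the
// UniquePositions.try_emplace pattern. A minimal standalone model over int
// "scalars"; buildReuseMask is a hypothetical name.
//===----------------------------------------------------------------------===//
#include <map>
#include <vector>

namespace slp_example {
// Returns the reuse mask; each distinct value keeps its first lane as the
// canonical position, and later duplicates point back at it.
inline std::vector<int> buildReuseMask(std::vector<int> &Scalars) {
  std::map<int, int> UniquePositions; // value -> first lane holding it
  std::vector<int> ReuseMask(Scalars.size());
  for (int I = 0, E = (int)Scalars.size(); I != E; ++I) {
    auto Res = UniquePositions.try_emplace(Scalars[I], I);
    Scalars[Res.first->second] = Scalars[I]; // keep canonical slot populated
    ReuseMask[I] = Res.first->second;
  }
  return ReuseMask;
}
} // namespace slp_example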
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                /* ... */);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  // ...
  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }
  // ...
  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (auto *Store = dyn_cast<StoreInst>(V))
    ScalarTy = Store->getValueOperand()->getType();
  else if (auto *IE = dyn_cast<InsertElementInst>(V))
    ScalarTy = IE->getOperand(1)->getType();
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    // ...
  }
  if (E->isGather()) {
    // ...
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }

  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      // ...
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, std::nullopt);
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  };

  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  // ...
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    // ...
      return !isKnownNonNegative(R, SimplifyQuery(*DL));
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    if (PostponedPHIs || !E->VectorizedValue) {
      // ...
                       PH->getParent()->getFirstInsertionPt());
      // ...
      V = FinalShuffle(V, E, VecTy);

      E->VectorizedValue = V;
      // ...
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);
    // ...
    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      // ...
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      // ...
      if (!VisitedBBs.insert(IBB).second) {
        // ...
      }
      // ...
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert(/* ... */
               MinBWs.contains(getOperandEntry(E, I)) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      // ...
    }
    assert(/* ... */ && "Invalid number of incoming values");
    // ...
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    // ...
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    // ...
    NewV = FinalShuffle(NewV, E, VecTy);
    E->VectorizedValue = NewV;
    // ...
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    // ...
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      // ...
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      // ...
              cast<FixedVectorType>(V->getType())->getNumElements()),
      // ...
    }
    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();
    // ...
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // ...
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      IsIdentity &= InsertIdx - Offset == I;
    }
    // ...
    if (!IsIdentity || NumElts != NumScalars) {
      // ...
      if (NumElts != NumScalars && Offset == 0) {
        // Follow the chain of inserts to see whether it covers the vector.
        while (/* ... */) {
          // ...
          InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        }
        // ...
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        // ...
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        // ...
        if (!IsFirstPoison.all()) {
          // ...
          for (unsigned I = 0; I < NumElts; I++) {
            if (/* ... */ IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              // ...
            }
            // ...
          }
        }
        // ...
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
        // ...
      }
      // ...
      for (unsigned I = 0; I < NumElts; I++) {
        // ...
      }
      // ...
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      // ...
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          // ...
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              // ...
                InsertMask[I] = I + NumElts;
              // ...
            }
          }
          // ...
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
          // ...
        } else {
          // ...
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          for (unsigned I = 0; I < NumElts; I++) {
            // ...
              InsertMask[I] += NumElts;
            // ...
          }
          // ...
              FirstInsert->getOperand(0), V, InsertMask,
              cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    // ...
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    // ...
    auto *CI = cast<CastInst>(VL0);
    // ...
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (/* ... */
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         /* ... */)) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
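  //===------------------------------------------------------------------===//
  // Illustrative sketch (comment-only, added for exposition; not part of the
  // original source): the MinBWs handling above re-selects the cast opcode
  // once operands have been narrowed. The decision table reduces to the
  // following hypothetical helper (CastOp and pickCastOpcode are stand-ins):
  //
  //   enum class CastOp { BitCast, Trunc, SExt, ZExt };
  //
  //   CastOp pickCastOpcode(unsigned DstBits, unsigned SrcBits,
  //                         bool SrcIsSigned) {
  //     if (DstBits == SrcBits)
  //       return CastOp::BitCast; // same width: reinterpret only
  //     if (DstBits < SrcBits)
  //       return CastOp::Trunc;   // narrowing
  //     // Widening: signedness of the minimized source picks sext vs zext.
  //     return SrcIsSigned ? CastOp::SExt : CastOp::ZExt;
  //   }
  //===------------------------------------------------------------------===//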
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        // ...
      } else {
        Type *CastTy = L->getType();
        // ...
      }
    }
    // ...
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    // ...
    assert((getOperandEntry(E, 1)->isGather() ||
            getOperandEntry(E, 2)->isGather() ||
            MinBWs.contains(getOperandEntry(E, 1)) ||
            MinBWs.contains(getOperandEntry(E, 2))) &&
           "Expected item in MinBWs.");
    if (True->getType() != VecTy)
      True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
    if (False->getType() != VecTy)
      False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    // ...
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    // ...
    if (auto *I = dyn_cast<Instruction>(V)) {
      // ...
    }
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        // ...
          auto *CI = dyn_cast<ConstantInt>(Op);
          return CI && CI->getValue().countr_one() >= It->second.first;
        // ...
        V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
        E->VectorizedValue = V;
        ++NumVectorInstructions;
        // ...
      }
    }
    // ...
    assert((getOperandEntry(E, 0)->isGather() ||
            getOperandEntry(E, 1)->isGather() ||
            MinBWs.contains(getOperandEntry(E, 0)) ||
            MinBWs.contains(getOperandEntry(E, 1))) &&
           "Expected item in MinBWs.");
    // ...
    if (auto *I = dyn_cast<Instruction>(V)) {
      // ...
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isCommutative(cast<Instruction>(V));
          }) /* ... */)
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Load: {
    // ...
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    // ...
    if (E->State == TreeEntry::Vectorize) {
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      // ...
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        // ...
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      // ...
          return cast<LoadInst>(V)->getPointerOperand();
        // ...
        std::optional<Value *> Stride =
            /* ... */
                (IsReverseOrder ? -1 : 1) *
                    static_cast<int>(DL->getTypeAllocSize(ScalarTy));
      // ...
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      // ...
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          /* ... */
          Builder.getInt32(E->Scalars.size()));
      // ...
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      // ...
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      // ...
    }
    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
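  //===------------------------------------------------------------------===//
  // Illustrative sketch (comment-only, added for exposition; not part of the
  // original source): the StridedVectorize path above computes the byte
  // stride passed to the strided-load intrinsic as the element distance
  // between the first and last pointers divided by (N - 1) lanes, scaled by
  // the element size and negated for reversed orders. In isolation
  // (hypothetical helper name):
  //
  //   int64_t strideInBytes(int64_t DiffInElems, unsigned NumLanes,
  //                         uint64_t ElemSizeInBytes, bool IsReverseOrder) {
  //     assert(NumLanes > 1 && "stride needs at least two lanes");
  //     const int64_t StrideInElems = DiffInElems / (int64_t)(NumLanes - 1);
  //     return (IsReverseOrder ? -1 : 1) * StrideInElems *
  //            (int64_t)ElemSizeInBytes;
  //   }
  //===------------------------------------------------------------------===//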
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);
    // ...
    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E, VecTy);
    // ...
    if (E->State == TreeEntry::Vectorize) {
      // ...
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      // ...
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          /* ... */
              StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
          /* ... */
          Builder.getInt32(E->Scalars.size()));
      // ...
    }
    // ...
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    // ...
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      // ...
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      // ...
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V)) {
          // ...
        }
      }
      // ...
    }
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);
    // ...
        It != MinBWs.end() ? It->second.first : 0);
    // ...
        VecCallCosts.first <= VecCallCosts.second;
    // ...
    Value *ScalarArg = nullptr;
    // ...
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. This argument should not be
      // vectorized.
      if (/* ... */) {
        ScalarArg = CEI->getArgOperand(I);
        // ...
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType())) {
          // ...
        }
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              /* ... */ &&
          It == MinBWs.end()) {
        // ...
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      // ...
    }
    // ...
    if (!UseIntrinsic) {
      // ...
    }
    // ...
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           /* ... */
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp())) &&
           "Invalid Shuffle Vector Operand");
    // ...
    if (/* ... */) {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      RHS = vectorizeOperand(E, 1, PostponedPHIs);
    } else {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
    }
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    // ...
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      Type *CastTy = VecTy;
      if (cast<VectorType>(LHS->getType())
              /* ... */
              ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                           /* ... */
                                           ->getIntegerBitWidth()) {
        // ...
      }
    // ...
    } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
      V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
      auto *AltCI = cast<CmpInst>(E->getAltOp());
      // ...
      V1 = Builder.CreateCmp(AltPred, LHS, RHS);
    } else {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(
          cast<VectorType>(LHS->getType())->getElementType());
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz <= SrcBWSz) {
        if (BWSz < SrcBWSz) {
          // ...
        }
        if (auto *I = dyn_cast<Instruction>(LHS)) {
          // ...
        }
        E->VectorizedValue = LHS;
        ++NumVectorInstructions;
        // ...
      }
      // ...
    }
    // ...
    for (Value *V : {V0, V1}) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
    // ...
    E->buildAltOpShuffleMask(
        /* ... */
          assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
        /* ... */
        Mask, &OpScalars, &AltScalars);
    // ...
    // Drop nuw flags in case of the shifted sub results.
    auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
      if (auto *I = dyn_cast<Instruction>(Vec);
          I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
          any_of(/* ... */, [](Value *V) {
            auto *IV = cast<Instruction>(V);
            return IV->getOpcode() == Instruction::Sub &&
                   isCommutative(cast<Instruction>(IV));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    };
    DropNuwFlag(V0, E->getOpcode());
    DropNuwFlag(V1, E->getAltOpcode());
    // ...
    if (auto *I = dyn_cast<Instruction>(V)) {
      // ...
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}
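//===----------------------------------------------------------------------===//
// Illustrative sketch (added for exposition, not part of the original
// source): the ShuffleVector case above blends the main-opcode vector V0 and
// the alternate-opcode vector V1 with a single mask in which lanes taken
// from the second operand are offset by the vector factor, mirroring what
// buildAltOpShuffleMask produces. A minimal standalone model; buildAltMask
// is a hypothetical name.
//===----------------------------------------------------------------------===//
#include <vector>

namespace slp_example {
inline std::vector<int> buildAltMask(const std::vector<bool> &IsAltLane) {
  const int VF = (int)IsAltLane.size();
  std::vector<int> Mask(VF);
  for (int I = 0; I != VF; ++I)
    Mask[I] = IsAltLane[I] ? VF + I : I; // second-operand lanes offset by VF
  return Mask;
}
} // namespace slp_example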
13686struct ShuffledInsertData {
13699 for (
auto &BSIter : BlocksSchedules) {
13700 scheduleBlock(BSIter.second.get());
13704 EntryToLastInstruction.
clear();
13714 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13715 if (TE->State == TreeEntry::Vectorize &&
13716 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13717 TE->VectorizedValue)
13723 for (
const TreeEntry *E : PostponedNodes) {
13724 auto *TE =
const_cast<TreeEntry *
>(E);
13725 if (
auto *VecTE = getTreeEntry(TE->Scalars.front()))
13726 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13727 TE->UserTreeIndices.front().EdgeIdx)) &&
13728 VecTE->isSame(TE->Scalars))
13732 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13733 TE->VectorizedValue =
nullptr;
13735 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13744 if (isa<PHINode>(UserI)) {
13747 for (
User *U : PrevVec->users()) {
13750 auto *UI = dyn_cast<Instruction>(U);
13751 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->
getParent())
13753 if (UI->comesBefore(InsertPt))
13762 if (Vec->
getType() != PrevVec->getType()) {
13764 PrevVec->getType()->isIntOrIntVectorTy() &&
13765 "Expected integer vector types only.");
13766 std::optional<bool> IsSigned;
13767 for (
Value *V : TE->Scalars) {
13768 if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
13769 auto It = MinBWs.
find(BaseTE);
13770 if (It != MinBWs.
end()) {
13771 IsSigned = IsSigned.value_or(
false) || It->second.second;
13775 for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
13776 auto It = MinBWs.
find(MNTE);
13777 if (It != MinBWs.
end()) {
13778 IsSigned = IsSigned.value_or(
false) || It->second.second;
13783 if (IsSigned.value_or(
false))
13786 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13787 auto It = MinBWs.
find(BVE);
13788 if (It != MinBWs.
end()) {
13789 IsSigned = IsSigned.value_or(
false) || It->second.second;
13794 if (IsSigned.value_or(
false))
13796 if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
13798 IsSigned.value_or(
false) ||
13802 if (IsSigned.value_or(
false))
13806 if (IsSigned.value_or(
false)) {
13808 auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
13809 if (It != MinBWs.
end())
13810 IsSigned = It->second.second;
13813 "Expected user node or perfect diamond match in MinBWs.");
13817 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
13820 auto It = PostponedValues.
find(PrevVec);
13821 if (It != PostponedValues.
end()) {
13822 for (TreeEntry *VTE : It->getSecond())
13823 VTE->VectorizedValue = Vec;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
  Value *Scalar = ExternalUse.Scalar;
  llvm::User *User = ExternalUse.User;
  int Lane = ExternalUse.Lane;

  // Skip users that we already RAUW. This happens when one instruction
  // has multiple uses of the same value.
  // ...
  TreeEntry *E = getTreeEntry(Scalar);
  assert(E && "Invalid scalar");
  assert(!E->isGather() && "Extracting from a gather list");
  // Non-instruction pointers are not deleted, just skip them.
  if (E->getOpcode() == Instruction::GetElementPtr &&
      !isa<GetElementPtrInst>(Scalar))
    continue;

  Value *Vec = E->VectorizedValue;
  assert(Vec && "Can't find vectorizable value");

  // ...
  auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
    if (Scalar->getType() != Vec->getType()) {
      Value *Ex = nullptr;
      Value *ExV = nullptr;
      auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
      bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
      auto It = ScalarToEEs.find(Scalar);
      if (It != ScalarToEEs.end()) {
        // No need to emit many extracts, just move the only one in the
        // current block.
        auto EEIt = It->second.find(Builder.GetInsertBlock());
        if (EEIt != It->second.end()) {
          Ex = EEIt->second.first;
          if (auto *CI = EEIt->second.second)
            CI->moveAfter(cast<Instruction>(Ex));
          ExV = EEIt->second.second ? EEIt->second.second : Ex;
        }
      }
      if (!Ex) {
        // "Reuse" the existing extract to improve final codegen.
        if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
          Value *V = ES->getVectorOperand();
          if (const TreeEntry *ETE = getTreeEntry(V))
            V = ETE->VectorizedValue;
          Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
        } else if (ReplaceGEP) {
          // Leave the GEPs as is, they are free in most cases.
          auto *CloneGEP = GEP->clone();
          if (isa<Instruction>(Vec))
            CloneGEP->insertBefore(*Builder.GetInsertBlock(),
                                   Builder.GetInsertPoint());
          else
            CloneGEP->insertBefore(GEP);
          if (GEP->hasName())
            CloneGEP->takeName(GEP);
          Ex = CloneGEP;
        } else {
          Ex = Builder.CreateExtractElement(Vec, Lane);
        }
        // If necessary, sign-extend or zero-extend the extracted value back
        // to the original scalar type.
        ExV = Ex;
        if (Scalar->getType() != Ex->getType())
          ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                      MinBWs.find(E)->second.second);
        if (auto *I = dyn_cast<Instruction>(Ex))
          ScalarToEEs[Scalar].try_emplace(
              Builder.GetInsertBlock(),
              std::make_pair(I, cast<Instruction>(ExV)));
      }
      // The extract may be folded to a constant, so guard the insertion into
      // the CSE sets.
      if (auto *ExI = dyn_cast<Instruction>(Ex)) {
        GatherShuffleExtractSeq.insert(ExI);
        CSEBlocks.insert(ExI->getParent());
      }
      return ExV;
    }
    assert(isa<FixedVectorType>(Scalar->getType()) &&
           isa<InsertElementInst>(Scalar) &&
           "In-tree scalar of vector type is not insertelement?");
    auto *IE = cast<InsertElementInst>(Scalar);
    VectorToInsertElement.try_emplace(Vec, IE);
    return Vec;
  };

  // If User == nullptr, the Scalar remains as scalar in vectorized
  // instructions or is used as an extra argument. Generate an extractelement
  // instruction and update the record for this scalar in
  // ExternallyUsedValues.
  if (!User) {
    if (!ScalarsWithNullptrUser.insert(Scalar).second)
      continue;
    assert((ExternallyUsedValues.count(Scalar) ||
            any_of(Scalar->users(),
                   [&](llvm::User *U) {
                     if (ExternalUsesAsGEPs.contains(U))
                       return true;
                     TreeEntry *UseEntry = getTreeEntry(U);
                     return UseEntry &&
                            (UseEntry->State == TreeEntry::Vectorize ||
                             UseEntry->State ==
                                 TreeEntry::StridedVectorize) &&
                            (E->State == TreeEntry::Vectorize ||
                             E->State == TreeEntry::StridedVectorize) &&
                            doesInTreeUserNeedToExtract(
                                Scalar,
                                cast<Instruction>(UseEntry->Scalars.front()),
                                TLI);
                   })) &&
           "Scalar with nullptr User must be registered in "
           "ExternallyUsedValues map or remain as scalar in vectorized "
           "instructions");
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (auto *PHI = dyn_cast<PHINode>(VecI))
        Builder.SetInsertPoint(PHI->getParent(),
                               PHI->getParent()->getFirstNonPHIIt());
      else
        Builder.SetInsertPoint(VecI->getParent(),
                               std::next(VecI->getIterator()));
    }
    Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    // Required to update internally referenced instructions.
    Scalar->replaceAllUsesWith(NewInst);
    ReplacedExternals.emplace_back(Scalar, NewInst);
    continue;
  }
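// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// A condensed, hedged sketch of the extract-and-extend step using LLVM's
// IRBuilder API (CreateExtractElement / CreateIntCast / getInt32 are real
// IRBuilder calls); the helper name and parameters are hypothetical. It
// shows the core idea: pull the lane out of the vector and, when the lane
// was demoted to a narrower integer type, cast it back with sext or zext as
// recorded by the minimum-bitwidth analysis.
#include "llvm/IR/IRBuilder.h"

static llvm::Value *extractAndExtend(llvm::IRBuilderBase &Builder,
                                     llvm::Value *Vec, unsigned Lane,
                                     llvm::Type *OrigScalarTy, bool IsSigned) {
  llvm::Value *Ex =
      Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
  // Widen back only if the vector element type was demoted.
  if (Ex->getType() != OrigScalarTy)
    Ex = Builder.CreateIntCast(Ex, OrigScalarTy, IsSigned);
  return Ex;
}
// --- end sketch ---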
  if (auto *VU = dyn_cast<InsertElementInst>(User);
      VU && VU->getOperand(1) == Scalar) {
    // Skip if the scalar is another vector op or Vec is not an instruction.
    if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
      if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        // Need to use the original vector, if the root is truncated.
        auto BWIt = MinBWs.find(E);
        if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
          auto *ScalarTy = FTy->getElementType();
          auto Key = std::make_pair(Vec, ScalarTy);
          auto VecIt = VectorCasts.find(Key);
          if (VecIt == VectorCasts.end()) {
            IRBuilderBase::InsertPointGuard Guard(Builder);
            if (auto *IVec = dyn_cast<PHINode>(Vec))
              Builder.SetInsertPoint(
                  IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
            else if (auto *IVec = dyn_cast<Instruction>(Vec))
              Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
            Vec = Builder.CreateIntCast(
                Vec,
                getWidenedType(
                    ScalarTy,
                    cast<FixedVectorType>(Vec->getType())->getNumElements()),
                BWIt->second.second);
            VectorCasts.try_emplace(Key, Vec);
          } else {
            Vec = VecIt->second;
          }
        }

        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          auto *It =
              find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
                // Checks if 2 insertelements are from the same buildvector.
                // ...
              });
          unsigned Idx = *InsertIdx;
          if (It == ShuffledInserts.end()) {
            (void)ShuffledInserts.emplace_back();
            It = std::next(ShuffledInserts.begin(),
                           ShuffledInserts.size() - 1);
            // ...
            while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
              if (IEBase != User &&
                  (!IEBase->hasOneUse() ||
                   getElementIndex(IEBase).value_or(Idx) == Idx))
                break;
              // Build the mask for the vectorized insertelement instructions.
              if (const TreeEntry *E = getTreeEntry(IEBase)) {
                do {
                  IEBase = cast<InsertElementInst>(Base);
                  int IEIdx = *getElementIndex(IEBase);
                  assert(Mask[IEIdx] == PoisonMaskElem &&
                         "InsertElementInstruction used already.");
                  Mask[IEIdx] = IEIdx;
                  Base = IEBase->getOperand(0);
                } while (E == getTreeEntry(Base));
                break;
              }
              Base = cast<InsertElementInst>(Base)->getOperand(0);
              // After the vectorization the def-use chain has changed, need
              // to look through original insertelement instructions, if they
              // get replaced by vector instructions.
              auto It = VectorToInsertElement.find(Base);
              if (It != VectorToInsertElement.end())
                Base = It->second;
            }
          }
          // ...
          Mask[Idx] = ExternalUse.Lane;
          It->InsertElements.push_back(cast<InsertElementInst>(User));
          continue;
        }
      }
    }
  }

  // Generate extracts for out-of-tree users and find the insertion point for
  // the extractelement lane.
  if (auto *VecI = dyn_cast<Instruction>(Vec)) {
    if (auto *PH = dyn_cast<PHINode>(User)) {
      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
        if (PH->getIncomingValue(I) == Scalar) {
          Instruction *IncomingTerminator =
              PH->getIncomingBlock(I)->getTerminator();
          if (isa<CatchSwitchInst>(IncomingTerminator)) {
            Builder.SetInsertPoint(VecI->getParent(),
                                   std::next(VecI->getIterator()));
          } else {
            Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
          }
          Value *NewInst = ExtractAndExtendIfNeeded(Vec);
          PH->setOperand(I, NewInst);
        }
      }
    } else {
      Builder.SetInsertPoint(cast<Instruction>(User));
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
  } else {
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
    Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    User->replaceUsesOfWith(Scalar, NewInst);
  }
}
auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
  SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
  SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
  int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
  for (int I = 0, E = Mask.size(); I < E; ++I) {
    if (Mask[I] < VF)
      CombinedMask1[I] = Mask[I];
    else
      CombinedMask2[I] = Mask[I] - VF;
  }
  ShuffleInstructionBuilder ShuffleBuilder(
      cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
  ShuffleBuilder.add(V1, CombinedMask1);
  if (V2)
    ShuffleBuilder.add(V2, CombinedMask2);
  return ShuffleBuilder.finalize(std::nullopt);
};

auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                     bool ForSingleMask) {
  unsigned VF = Mask.size();
  unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
  if (VF != VecVF) {
    if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    if (!ForSingleMask) {
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      for (unsigned I = 0; I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem)
          ResizeMask[Mask[I]] = Mask[I];
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
  }
  return std::make_pair(Vec, false);
};
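// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// Plain-C++ illustration of the mask split performed by CreateShuffle above:
// indices below VF select from V1, indices at or above VF select lane
// (Idx - VF) from V2; -1 stands for an undefined (poison) lane, matching
// LLVM's PoisonMaskElem convention.
#include <cassert>
#include <vector>

void splitMask(const std::vector<int> &Mask, int VF, std::vector<int> &Mask1,
               std::vector<int> &Mask2) {
  Mask1.assign(Mask.size(), -1);
  Mask2.assign(Mask.size(), -1);
  for (int I = 0, E = Mask.size(); I < E; ++I) {
    if (Mask[I] < 0)
      continue; // poison lane stays poison in both masks
    if (Mask[I] < VF)
      Mask1[I] = Mask[I];
    else
      Mask2[I] = Mask[I] - VF;
  }
}

int main() {
  std::vector<int> M1, M2;
  splitMask({0, 5, -1, 2}, /*VF=*/4, M1, M2);
  assert((M1 == std::vector<int>{0, -1, -1, 2}));
  assert((M2 == std::vector<int>{-1, 1, -1, -1}));
}
// --- end sketch ---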
// Perform shuffling of the vectorize tree entries for better handling of
// external extracts.
for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
  // ...
  auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
  Value *NewInst = performExtractsShuffleAction<Value>(
      MutableArrayRef(Vector.data(), Vector.size()),
      FirstInsert->getOperand(0),
      [](Value *Vec) {
        return cast<VectorType>(Vec->getType())
            ->getElementCount()
            .getKnownMinValue();
      },
      ResizeToVF,
      [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                    ArrayRef<Value *> Vals) {
        assert((Vals.size() == 1 || Vals.size() == 2) &&
               "Expected exactly 1 or 2 input values.");
        if (Vals.size() == 1) {
          // Do not create a shuffle if the mask is a simple identity
          // non-resizing mask.
          if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                 ->getNumElements() ||
              !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
            return CreateShuffle(Vals.front(), nullptr, Mask);
          return Vals.front();
        }
        return CreateShuffle(Vals.front() ? Vals.front()
                                          : FirstInsert->getOperand(0),
                             Vals.back(), Mask);
      });
  auto It = ShuffledInserts[I].InsertElements.rbegin();
  // Rebuild the buildvector chain.
  InsertElementInst *II = nullptr;
  if (It != ShuffledInserts[I].InsertElements.rend())
    II = *It;
  // ...
  while (It != ShuffledInserts[I].InsertElements.rend()) {
    assert(II && "Must be an insertelement instruction.");
    // ...
    II = dyn_cast<InsertElementInst>(II->getOperand(0));
  }
  // ...
  II->replaceUsesOfWith(II->getOperand(0), NewInst);
  if (auto *NewI = dyn_cast<Instruction>(NewInst))
    if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
      II->moveAfter(NewI);
  // ...
  LastInsert->replaceAllUsesWith(NewInst);
  for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
    IE->replaceUsesOfWith(IE->getOperand(0),
                          PoisonValue::get(IE->getOperand(0)->getType()));
    IE->replaceUsesOfWith(IE->getOperand(1),
                          PoisonValue::get(IE->getOperand(1)->getType()));
    eraseInstruction(IE);
  }
  CSEBlocks.insert(LastInsert->getParent());
}
// For each vectorized value:
for (auto &TEPtr : VectorizableTree) {
  TreeEntry *Entry = TEPtr.get();

  // No need to handle users of gathered values.
  if (Entry->isGather())
    continue;

  assert(Entry->VectorizedValue && "Can't find vectorizable value");

  // For each lane:
  for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
    Value *Scalar = Entry->Scalars[Lane];

    if (Entry->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;
    Type *Ty = Scalar->getType();
    if (!Ty->isVoidTy()) {
      for (User *U : Scalar->users()) {
        // ...
        assert((getTreeEntry(U) ||
                (UserIgnoreList && UserIgnoreList->contains(U)) ||
                (isa_and_nonnull<Instruction>(U) &&
                 isDeleted(cast<Instruction>(U)))) &&
               "Deleting out-of-tree value");
      }
    }
    LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
    auto *I = cast<Instruction>(Scalar);
    RemovedInsts.push_back(I);
  }
}

// Merge the DIAssignIDs from the about-to-be-deleted instructions into the
// replacement vector value.
if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
  V->mergeDIAssignID(RemovedInsts);

// Clear up reduction references, if any.
if (UserIgnoreList) {
  for (Instruction *I : RemovedInsts) {
    if (getTreeEntry(I)->Idx != 0)
      continue;
    // ...
    I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
      // Do not replace the condition of a poison-safe logical op of the
      // form select <cond>, ...
      bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                  (match(U.getUser(), m_LogicalAnd()) ||
                                   match(U.getUser(), m_LogicalOr())) &&
                                  U.getOperandNo() == 0;
      if (IsPoisoningLogicalOp) {
        LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
        return false;
      }
      return UserIgnoreList->contains(U.getUser());
    });
  }
}
// ...
removeInstructionsAndOperands(ArrayRef(RemovedInsts));
// ...
InstrElementSize.clear();
const TreeEntry &RootTE = *VectorizableTree.front();
Value *Vec = RootTE.VectorizedValue;
if (auto It = MinBWs.find(&RootTE);
    ReductionBitWidth != 0 && It != MinBWs.end() &&
    ReductionBitWidth != It->second.first) {
  IRBuilder<>::InsertPointGuard Guard(Builder);
  Builder.SetInsertPoint(ReductionRoot->getParent(),
                         ReductionRoot->getIterator());
  Vec = Builder.CreateIntCast(
      Vec,
      VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                      cast<VectorType>(Vec->getType())->getElementCount()),
      It->second.second);
}
return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM the gather/shuffle/extract sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or any of its operands are defined inside this loop, we
    // cannot hoist the instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB))
      CSEWorkList.push_back(N);

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or undefs.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the trailing undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };

  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          // If we extended the mask, update the dominating shuffle.
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          eraseInstruction(&In);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          eraseInstruction(V);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
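// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// A minimal model of the "identical or less defined" shuffle-mask test used
// by the CSE loop above: given the same vector operands, one shuffle can
// replace another if, lane by lane, the masks either match or one side is
// undef (-1); undef lanes adopt the defined value in the merged mask. The
// function name and exact merge policy here are a simplification.
#include <cassert>
#include <cstddef>
#include <vector>

bool isIdenticalOrLessDefined(const std::vector<int> &M1,
                              const std::vector<int> &M2,
                              std::vector<int> &Merged) {
  if (M1.size() != M2.size())
    return false;
  Merged = M2;
  for (std::size_t I = 0, E = M1.size(); I != E; ++I) {
    if (M2[I] == -1)
      Merged[I] = M1[I]; // adopt the defined lane
    else if (M1[I] != -1 && M1[I] != M2[I])
      return false; // both defined but conflicting
  }
  return true;
}

int main() {
  std::vector<int> Merged;
  assert(isIdenticalOrLessDefined({0, -1, 2}, {0, 1, -1}, Merged));
  assert((Merged == std::vector<int>{0, 1, 2}));
  assert(!isIdenticalOrLessDefined({0, 1}, {1, 1}, Merged));
}
// --- end sketch ---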
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // ...
  Instruction *OldScheduleEnd = ScheduleEnd;
  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](
                                   bool ReSchedule, ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }
    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" we know there are no
    // cyclic dependencies and the bundle can be scheduled.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region grew but the bundle cannot be scheduled, dependencies
      // must still be recalculated; otherwise the compiler may crash trying
      // to calculate stale dependencies later.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle; discard the existing
    // schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.OpValue);
    return std::nullopt;
  }
  return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0)
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
  }
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData from the current chunk, starting a new chunk
  // when the current one is full.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
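// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// The chunked allocation scheme above, modeled as a tiny generic arena:
// objects are handed out from fixed-size arrays so pointers stay stable and
// nothing is freed until the whole analysis is torn down. Names are
// hypothetical.
#include <cstddef>
#include <memory>
#include <vector>

template <typename T, std::size_t ChunkSize = 256> class ChunkArena {
  std::vector<std::unique_ptr<T[]>> Chunks;
  std::size_t ChunkPos = ChunkSize; // force a new chunk on first allocation
public:
  T *allocate() {
    if (ChunkPos >= ChunkSize) {
      Chunks.push_back(std::make_unique<T[]>(ChunkSize));
      ChunkPos = 0;
    }
    return &Chunks.back()[ChunkPos++];
  }
};

int main() {
  ChunkArena<int> Arena;
  int *A = Arena.allocate();
  int *B = Arena.allocate();
  return (A != B) ? 0 : 1; // distinct, stable slots
}
// --- end sketch ---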
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  // If V is already in the scheduling region, nothing to do.
  if (getScheduleData(V, isOneOf(S, V)))
    return true;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
    ScheduleData *ISD = getScheduleData(I);
    if (!ISD)
      return false;
    assert(isInSchedulingRegion(ISD) &&
           "ScheduleData not in scheduling region");
    ScheduleData *SD = allocateScheduleDataChunks();
    SD->Inst = I;
    SD->init(SchedulingRegionID, S.OpValue);
    ExtraScheduleDataMap[I][S.OpValue] = SD;
    return true;
  };
  if (CheckScheduleForI(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    if (isOneOf(S, I) != I)
      CheckScheduleForI(I);
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info and other "assume-like" intrinsics so they don't count
  // against the region-size budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    if (isOneOf(S, I) != I)
      CheckScheduleForI(I);
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  if (isOneOf(S, I) != I)
    CheckScheduleForI(I);
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
      SD->Inst = I;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      if (BundleMember->OpValue != BundleMember->Inst) {
        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      } else {
        for (User *U : BundleMember->Inst->users()) {
          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
            BundleMember->Dependencies++;
            ScheduleData *DestBundle = UseSD->FirstInBundle;
            if (!DestBundle->IsScheduled)
              BundleMember->incrementUnscheduledDeps(1);
            if (!DestBundle->hasValidDependencies())
              WorkList.push_back(DestBundle);
          }
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate is control dependent on
      // any early exit or non-willreturn call which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I))
            continue;
          MakeControlDependent(I);
          // ...
        }
      }

      if (RegionHasStackSave) {
        // Allocas must be ordered against a following stacksave/stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              break;
            if (!isa<AllocaInst>(I))
              continue;
            MakeControlDependent(I);
          }
        }
        // In addition, every alloca or memory access must be ordered against
        // a following stacksave/stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;
            MakeControlDependent(I);
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      assert(BundleMember->Inst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      Instruction *SrcInst = BundleMember->Inst;
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
        // Two limits bound the complexity: AliasedCheckLimit caps the number
        // of (expensive) alias queries, and MaxMemDepDistance aborts the
        // whole loop for very large blocks; beyond either limit a dependence
        // is assumed conservatively.
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          NumAliased++;

          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }
        // ...
        DistToSrc++;
      }
    }
  }
  if (InsertInReadyList && SD->isReady()) {
    ReadyInsts.insert(SD);
    LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
                      << "\n");
  }
}
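// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// The capped memory-dependence scan above, modeled on plain indices: walk
// the load/store chain after the source access and record a dependence when
// aliasing is proven, or conservatively once the distance or alias-query
// budget is exhausted. Types, names, and default budgets are hypothetical.
#include <cstddef>
#include <functional>
#include <vector>

struct MemAccess {
  bool MayWrite;
};

// Returns the indices in Chain (after Src) that must stay ordered after the
// access at Src.
std::vector<std::size_t>
memoryDependencies(const std::vector<MemAccess> &Chain, std::size_t Src,
                   const std::function<bool(std::size_t, std::size_t)> &MayAlias,
                   std::size_t MaxDist = 160, std::size_t AliasBudget = 10) {
  std::vector<std::size_t> Deps;
  std::size_t NumAliased = 0;
  std::size_t DistToSrc = 1;
  for (std::size_t I = Src + 1; I < Chain.size(); ++I, ++DistToSrc) {
    // Two reads never conflict.
    if (!Chain[Src].MayWrite && !Chain[I].MayWrite)
      continue;
    bool Dependent;
    if (DistToSrc >= MaxDist || NumAliased >= AliasBudget) {
      // Budget exhausted: stop querying the expensive alias oracle and
      // conservatively record a dependence.
      Dependent = true;
    } else {
      Dependent = MayAlias(Src, I);
      if (Dependent)
        ++NumAliased; // only proven aliases consume the budget
    }
    if (Dependent)
      Deps.push_back(I);
  }
  return Deps;
}

int main() {
  std::vector<MemAccess> Chain = {{true}, {false}, {true}};
  auto NoAlias = [](std::size_t, std::size_t) { return false; };
  return memoryDependencies(Chain, 0, NoAlias).empty() ? 0 : 1;
}
// --- end sketch ---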
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    doForAllOpcodes(I, [&](ScheduleData *SD) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    });
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A ScheduleData can only be scheduled once all its dependencies are
  // calculated, so reset everything first.
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with the initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
      (void)SDTE;
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    });
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst);
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
      if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
        assert(SD->IsScheduled && "must be scheduled at this point");
      }
    });
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
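// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// The ready list above, modeled with standard C++: the comparator
// SD2->Priority < SD1->Priority orders the set descending, so *begin() is
// the ready bundle that sits latest in the original program order - the
// right one to place first when the block is rebuilt bottom-up.
#include <cassert>
#include <set>

struct SD {
  int SchedulingPriority;
};

struct SDCompare {
  bool operator()(const SD *SD1, const SD *SD2) const {
    return SD2->SchedulingPriority < SD1->SchedulingPriority;
  }
};

int main() {
  SD A{0}, B{2}, C{1};
  std::set<SD *, SDCompare> Ready = {&A, &B, &C};
  assert((*Ready.begin())->SchedulingPriority == 2); // latest first
}
// --- end sketch ---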
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, traverse the expression tree bottom-up to find
  // loads, stores and extracts that feed it, and take the maximum of their
  // widths.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip it.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load, update Width to reflect the type
    // of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    else
      // Otherwise, we need to visit the operands of the instruction. We only
      // handle the interesting cases from buildTree here.
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
  }

  // If we didn't encounter a memory access in the expression tree, just
  // return the width of V. Otherwise, return the maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  if (auto *I = dyn_cast<Instruction>(V))
    InstrElementSize[I] = Width;

  return Width;
}
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
    bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
  // ...
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      // ...
      if (auto *I = dyn_cast<Instruction>(V)) {
        APInt Mask = DB->getDemandedBits(I);
        unsigned BitWidth2 =
            std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
        while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
          // ...
        }
        BitWidth1 = std::min(BitWidth1, BitWidth2);
      }
      BitWidth = std::max(BitWidth, BitWidth1);
    }
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (UserIgnoreList && UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, Level, IsProfitableToDemote,
                                 IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          NeedToExit = true;
          return false;
        }
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth,
          ArrayRef<const TreeEntry *> Operands = std::nullopt,
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // Similarly, an arithmetic shr can be truncated iff enough sign bits
    // survive the narrowing.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands. Note
  // that recursion is bounded by the maximum tree depth.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        // ...
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 // ...
                 SignBits <= Op1SignBits &&
                 (SignBits != Op1SignBits
                  // ...
                  )));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      // ...
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }

  return FinalAnalysis();
}
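// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// The LShr legality test above, reduced to plain integer arithmetic: an
// N-bit lshr can be narrowed to W bits when the shift amount is provably
// less than W and the bits that would be shifted in (bit W and above) are
// already known zero. Assumes TargetBits < 64; names are hypothetical.
#include <cassert>
#include <cstdint>

bool canDemoteLShr(uint64_t KnownZeroHighMask, uint64_t MaxShiftAmt,
                   unsigned OrigBits, unsigned TargetBits) {
  // Bits [TargetBits, OrigBits) must be known zero in operand 0.
  uint64_t HighBits = (OrigBits == 64 ? ~0ULL : (1ULL << OrigBits) - 1) &
                      ~((1ULL << TargetBits) - 1);
  return MaxShiftAmt < TargetBits &&
         (KnownZeroHighMask & HighBits) == HighBits;
}

int main() {
  // i32 -> i16: shift amount at most 12, top 16 bits known zero.
  assert(canDemoteLShr(/*KnownZeroHighMask=*/0xFFFF0000u, /*MaxShiftAmt=*/12,
                       /*OrigBits=*/32, /*TargetBits=*/16));
  assert(!canDemoteLShr(0x0000FFFFu, 12, 32, 16)); // high bits not zero
}
// --- end sketch ---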
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx >
                                       static_cast<int>(NodeIdx);
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // it is the final node.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is a trunc and the next node is a gather/buildvector,
    // then keep the trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    const TreeEntry *TE = getTreeEntry(U);
                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                    if (TE == UserTE || !TE)
                      return false;
                    // ...
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    auto It = MinBWs.find(TE);
                    if (It != MinBWs.end() && It->second.first > UserTESz)
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type
    // rather than sign-extended: this holds when the sign bit of all the
    // roots is known to be zero.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, add one to the maximum
      // bit width to account for the unknown sign bit, so that a safe
      // sign-extension remains possible.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large, do not demote when the narrowed type
    // still splits into the same number of vector registers.
    unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));
    if (NumParts > 1 &&
        NumParts == TTI->getNumberOfParts(getWidenedType(
                        IntegerType::get(TreeRootIT->getContext(),
                                         bit_ceil(MaxBitWidth)),
                        VF)))
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, MaxDepthLevel, NeedToDemote,
                               IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
                                               ->getOperand(0)
                                               ->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };

  // Add the reduction-op sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    for (Value *V : *UserIgnoreList) {
      auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
        ++BitWidth1;
      unsigned BitWidth2 = BitWidth1;
      // ...
      {
        APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      }
      ReductionBitWidth =
          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
    }
    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
      ReductionBitWidth = 8;
    ReductionBitWidth = bit_ceil(ReductionBitWidth);
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is not less than the width of the
    // roots' type, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
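// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// The width normalization used repeatedly above, in isolation: computed
// widths are rounded up to the next power of two, and widths between 2 and
// 7 bits are bumped to a full byte since sub-byte integer types are not
// profitable targets. Requires C++20 for std::bit_ceil.
#include <bit>
#include <cassert>

unsigned normalizeBitWidth(unsigned MaxBitWidth) {
  if (MaxBitWidth > 1 && MaxBitWidth < 8)
    MaxBitWidth = 8;                 // sub-byte types are not profitable
  return std::bit_ceil(MaxBitWidth); // e.g. 12 -> 16, 17 -> 32
}

int main() {
  assert(normalizeBitWidth(1) == 1);
  assert(normalizeBitWidth(5) == 8);
  assert(normalizeBitWidth(12) == 16);
}
// --- end sketch ---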
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  // ...

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  // ...
  DL = &F.getDataLayout();
  // ...
  bool Changed = false;

  // If the target claims to have no vector registers, don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs()
        << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom-up SLP vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) || !has_single_bit(VF) || VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered.
    // ...
    SmallPtrSet<Value *, 8> ValOps;
    for (Value *V : Chain)
      ValOps.insert(cast<StoreInst>(V)->getValueOperand());
    // Operands are not the same/alt opcodes or non-power-of-2 uniques - exit.
    InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
    if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
      // ...
      if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
           (!S.MainOp->isSafeToRemove() ||
            any_of(ValOps.getArrayRef(),
                   [&](Value *V) {
                     return !isa<ExtractElementInst>(V) &&
                            (V->getNumUses() > Chain.size() ||
                             any_of(V->users(), [&](User *U) {
                               return !Stores.contains(U);
                             }));
                   }))) ||
          (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
        Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
        return false;
      }
    }
    // ...
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getTreeSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();

  R.computeMinimumValueSizes();
  R.transformNodes();

  Size = R.getTreeSize();
  if (S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks if the quantity of the store-tree sizes is quite uniform.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
}
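// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// A worked standalone version of the uniformity test above: with integer
// arithmetic, Dev * 81 / (Mean * Mean) == 0 holds exactly when the variance
// is below Mean^2 / 81, i.e. when the standard deviation is under Mean / 9.
// The signed difference guards against unsigned wraparound.
#include <cassert>
#include <cstdint>
#include <vector>

bool sizesAreUniform(const std::vector<unsigned> &Sizes) {
  uint64_t Sum = 0;
  for (unsigned S : Sizes)
    Sum += S;
  uint64_t Mean = Sum / Sizes.size();
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (unsigned S : Sizes) {
    int64_t D = static_cast<int64_t>(S) - static_cast<int64_t>(Mean);
    Dev += static_cast<uint64_t>(D * D);
  }
  Dev /= Sizes.size();
  return Dev * 81 / (Mean * Mean) == 0;
}

int main() {
  assert(sizesAreUniform({9, 9, 9, 9}));   // Dev == 0
  assert(sizesAreUniform({9, 9, 9, 10}));  // tiny spread passes
  assert(!sizesAreUniform({2, 2, 2, 30})); // one outlier fails
}
// --- end sketch ---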
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };

  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to the base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      unsigned MaxRegVF = MaxVF;

      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      if (ValueTy == StoreTy &&
          R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
        MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = Operands.size();
        if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
          NonPowerOf2VF = CandVF;
      }

      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  // ...
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1)));
        unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize the max number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.clear();
        CandidateVFs.push_back(VF);
      }
    }
  };
  // Stores pair (first: index of the store into Stores array ref, address of
  // which taken as base, second: sorted set of pairs {index, dist}, which are
  // indices of stores in the set and their store location distances relative
  // to the base address).
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;

  // Inserts the specified store SI with the given index Idx into the set of
  // stores. If a store with the same distance is found already, stop the
  // insertion and try to vectorize the already-found stores.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      StoreIndexToDistSet PrevSet;
      PrevSet.swap(Set.second);
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match, to try to vectorize
      // them with this store.
      unsigned StartIdx = It->first + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found dup store (or this store, since
      // they store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences, we already tried.
        if (Pair.first <= It->first ||
            VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - It->second;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };

  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
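// --- Illustrative sketch (standalone, not part of SLPVectorizer.cpp) ---
// An approximation of the candidate-VF generation implied above: powers of
// two from MaxVF down to MinVF, optionally with one extra non-power-of-two
// candidate covering the whole store group when that is allowed. The helper
// name and exact selection policy are simplifications of the real logic.
#include <cassert>
#include <vector>

std::vector<unsigned> candidateVFs(unsigned MinVF, unsigned MaxVF,
                                   unsigned GroupSize, bool AllowNonPowerOf2) {
  std::vector<unsigned> VFs;
  for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2)
    VFs.push_back(VF);
  // One extra non-power-of-2 candidate for the whole group.
  if (AllowNonPowerOf2 && (GroupSize & (GroupSize - 1)) != 0 &&
      GroupSize > MaxVF)
    VFs.insert(VFs.begin(), GroupSize);
  return VFs;
}

int main() {
  auto VFs = candidateVFs(/*MinVF=*/2, /*MaxVF=*/8, /*GroupSize=*/11,
                          /*AllowNonPowerOf2=*/true);
  assert((VFs == std::vector<unsigned>{11, 8, 4, 2}));
}
// --- end sketch ---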
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type; we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S.getOpcode())
    return false;

  Instruction *I0 = cast<Instruction>(S.OpValue);
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  Type *ScalarTy = VL[0]->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
    ScalarTy = IE->getOperand(1)->getType();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is used
    // for the vector code during codegen).
    // ...
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!isPowerOf2_32(ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
      // Check that a previous iteration of this loop did not delete the Value.
      if (llvm::any_of(Ops, [&R](Value *V) {
            auto *I = dyn_cast<Instruction>(V);
            return I && R.isDeleted(I);
          }))
        continue;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      R.transformNodes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P)
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P)
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P)
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P)
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  // ...
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  // And/or are potentially poison-safe logical patterns like:
  // select x, y, false
  // select x, true, y
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not
      // have to rule out -0.0 here because the intrinsic semantics do not
      // specify a fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, we skip the true-value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
    switch (Kind) {
    case RecurKind::Or:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::And:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::FMax:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
    case RecurKind::FMin:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
    case RecurKind::FMaximum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
    case RecurKind::FMinimum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
    case RecurKind::SMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
    case RecurKind::SMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
    case RecurKind::UMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
    case RecurKind::UMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode, with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }

public:
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Look for min/max patterns hidden behind zext/sext of the compared
      // extractelements.
      // ...
      if (!isa<ExtractElementInst>(RHS) ||
          !L2->isIdenticalTo(cast<Instruction>(RHS)))
        return RecurKind::None;
      // ...
      if (!isa<ExtractElementInst>(LHS) ||
          !L1->isIdenticalTo(cast<Instruction>(LHS)))
        return RecurKind::None;
      // ...
      if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
        return RecurKind::None;
      // ...
      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
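// --- Illustrative sketch (hedged, not part of SLPVectorizer.cpp) ---
// A small example against LLVM's PatternMatch API (m_SMax and friends exist
// in llvm/IR/PatternMatch.h and match both the canonical cmp+select form and
// the llvm.smax intrinsic), which is why the classifier above can map either
// representation to the same RecurKind. The helper name is hypothetical.
#include "llvm/IR/PatternMatch.h"

static bool looksLikeSMaxRoot(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *LHS, *RHS;
  // Matches "select (icmp sgt a, b), a, b" as well as "llvm.smax(a, b)".
  return match(V, m_SMax(m_Value(LHS), m_Value(RHS)));
}
// --- end sketch ---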
  /// Return position of the first reduced value.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have a
      // single use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // An arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    // ...
    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<Instruction *> Worklist(1, Root);
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or as extra arguments
    // that are not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &ExtraArgs,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // An edge with the wrong parent is marked as an extra argument.
        if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
            !hasSameParent(EdgeInst, BB)) {
          ExtraArgs.push_back(EdgeVal);
          continue;
        }
        // If the edge is not an instruction, or it differs from the main
        // reduction opcode, or has too many uses, it is a possible reduced
        // value.
        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup the reduced values so that it becomes more profitable to
    // reduce them. Values are grouped by their value ids, instructions by
    // instruction op id and/or alternate op id, plus extra analysis for loads
    // (grouping them by the distance between pointers) and cmp instructions
    // (grouping them by the predicate).
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      // ...
      auto LIt = LoadsMap.find(Ptr);
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        if (LIt->second.size() > 2) {
          // ...
          return hash_value(LIt->second.back()->getPointerOperand());
        }
      }
      LoadKeyUsed.insert(Key);
      LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      Instruction *TreeN = Worklist.pop_back_val();
      SmallVector<Value *> Args;
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
      // If too many extra args, mark the instruction itself as a reduced
      // value; otherwise mark the extra arguments.
      if (Args.size() < 2) {
        addReductionOps(TreeN);
        if (!Args.empty()) {
          assert(Args.size() == 1 && "Expected only single argument.");
          ExtraArgs[TreeN] = Args.front();
        }
        // Add reduction values. The values are sorted for better
        // vectorization results.
        for (Value *V : PossibleRedVals) {
          size_t Key, Idx;
          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                                 /*AllowAlternate=*/false);
          ++PossibleReducedVals[Key][Idx]
                .insert(std::make_pair(V, 0))
                .first->second;
        }
        Worklist.append(PossibleReductionOps.rbegin(),
                        PossibleReductionOps.rend());
      } else {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI,
                                               GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(TreeN, 0))
              .first->second;
      }
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced-value sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    return true;
  }
17132    constexpr int ReductionLimit = 4;
17133    constexpr unsigned RegMaxNumber = 4;
17134    constexpr unsigned RedValsMaxNumber = 128;
17138    unsigned NumReducedVals =
17139        std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
17141                          if (!isGoodForReduction(Vals))
17143                          return Num + Vals.size();
17145    if (NumReducedVals < ReductionLimit &&
17150      for (ReductionOpsType &RdxOps : ReductionOps)
17151        for (Value *RdxOp : RdxOps)
17152          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17163        ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
17166    ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
17169    for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
17170      assert(Pair.first && "DebugLoc must be set.");
17171      ExternallyUsedValues[Pair.second].push_back(Pair.first);
17172      TrackedVals.try_emplace(Pair.second, Pair.second);
17177    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
17178      assert(isa<SelectInst>(RdxRootInst) &&
17179             "Expected min/max reduction to have select root instruction");
17180      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
17181      assert(isa<Instruction>(ScalarCond) &&
17182             "Expected min/max reduction to have compare condition");
17183      return cast<Instruction>(ScalarCond);
17187    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
17188      if (VectorizedTree) {
17191          cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
17192        if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
17195          auto It = ReducedValsToOps.find(Res);
17196          if (It != ReducedValsToOps.end() &&
17202        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
17208    bool AnyBoolLogicOp =
17210          return isBoolLogicOp(cast<Instruction>(V));
17214        ExternallyUsedValues[ReductionRoot];
17216                                 ReductionOps.front().size());
17217    for (ReductionOpsType &RdxOps : ReductionOps)
17218      for (Value *RdxOp : RdxOps) {
17221        IgnoreList.insert(RdxOp);
17226    for (Value *U : IgnoreList)
17227      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
17228        RdxFMF &= FPMO->getFastMathFlags();
17229    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
17234    for (Value *V : Candidates)
17235      TrackedVals.try_emplace(V, V);
17241    Value *VectorizedTree = nullptr;
17242    bool CheckForReusedReductionOps = false;
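For orientation, a sketch of how GetNewVectorizedTree chains partial results (editor's addition, not from the source; instruction names below are illustrative and assume RdxKind == RecurKind::Add):

//   %op.rdx  = add i32 %P0, %P1      ; first pair of partial results
//   %op.rdx1 = add i32 %op.rdx, %P2  ; folded left to right via createOp()
// Other reduction kinds substitute the matching binary op, FP op, or
// min/max intrinsic; the "op.rdx" base name comes from the createOp()
// call in the lambda above.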
17247    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17249      InstructionsState S = States[I];
17253      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17254        Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
17259        auto *Inst = dyn_cast<Instruction>(RdxVal);
17261            (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
17262            (S.getOpcode() && !Inst))
17265        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
17267      bool ShuffledExtracts = false;
17269      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17271        InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
17272        if (NextS.getOpcode() == Instruction::ExtractElement &&
17273            !NextS.isAltShuffle()) {
17275          for (Value *RV : ReducedVals[I + 1]) {
17276            Value *RdxVal = TrackedVals.find(RV)->second;
17280            if (auto *Inst = dyn_cast<Instruction>(RdxVal))
17281              if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
17283            CommonCandidates.push_back(RdxVal);
17284            TrackedToOrig.try_emplace(RdxVal, RV);
17289          Candidates.swap(CommonCandidates);
17290          ShuffledExtracts = true;
17299        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17301        Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17302        ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17303        if (auto *ResI = dyn_cast<Instruction>(Res))
17304          V.analyzedReductionRoot(ResI);
17306        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17310      unsigned NumReducedVals = Candidates.size();
17311      if (NumReducedVals < ReductionLimit &&
17318      IsSupportedHorRdxIdentityOp =
17320          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17323      if (IsSupportedHorRdxIdentityOp)
17324        for (Value *V : Candidates)
17325          ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17336      bool SameScaleFactor = false;
17337      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17338                              SameValuesCounter.size() != Candidates.size();
17339      if (OptReusedScalars) {
17341            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17342             RdxKind == RecurKind::Xor) &&
17344            [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17345              return P.second == SameValuesCounter.front().second;
17347        Candidates.resize(SameValuesCounter.size());
17348        transform(SameValuesCounter, Candidates.begin(),
17349                  [](const auto &P) { return P.first; });
17350        NumReducedVals = Candidates.size();
17352        if (NumReducedVals == 1) {
17353          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17354          unsigned Cnt = SameValuesCounter.lookup(OrigV);
17356              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17357          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17358          VectorizedVals.try_emplace(OrigV, Cnt);
17363      unsigned MaxVecRegSize = V.getMaxVecRegSize();
17364      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17368      unsigned ReduxWidth = std::min<unsigned>(
17370          std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17371                               RegMaxNumber * RedValsMaxNumber));
17372      unsigned Start = 0;
17373      unsigned Pos = Start;
17375      unsigned PrevReduxWidth = ReduxWidth;
17376      bool CheckForReusedReductionOpsLocal = false;
17377      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17378                                  &CheckForReusedReductionOpsLocal,
17379                                  &PrevReduxWidth, &V,
17380                                  &IgnoreList](bool IgnoreVL = false) {
17381        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17382        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17385          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17388        if (Pos < NumReducedVals - ReduxWidth + 1)
17389          return IsAnyRedOpGathered;
17392        return IsAnyRedOpGathered;
17394      bool AnyVectorized = false;
17395      while (Pos < NumReducedVals - ReduxWidth + 1 &&
17396             ReduxWidth >= ReductionLimit) {
17399        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17401          CheckForReusedReductionOps = true;
17404        PrevReduxWidth = ReduxWidth;
17407        if (V.areAnalyzedReductionVals(VL)) {
17408          (void)AdjustReducedVals(/*IgnoreVL=*/true);
17414        auto *RedValI = dyn_cast<Instruction>(RedVal);
17417          return V.isDeleted(RedValI);
17420        V.buildTree(VL, IgnoreList);
17421        if (V.isTreeTinyAndNotFullyVectorizable(true)) {
17422          if (!AdjustReducedVals())
17423            V.analyzedReductionVals(VL);
17426        if (V.isLoadCombineReductionCandidate(RdxKind)) {
17427          if (!AdjustReducedVals())
17428            V.analyzedReductionVals(VL);
17431        V.reorderTopToBottom();
17433        V.reorderBottomToTop(true);
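A rough schedule sketch for the scan loop above (editor's addition, inferred from the visible Pos/ReduxWidth logic, so treat the details as an assumption):

// AdjustReducedVals() advances Pos within the current power-of-two window;
// once the window has slid past the end of the candidate list it resets Pos
// to Start and retries with a narrower (halved) ReduxWidth. Halving stops
// when ReduxWidth drops below the ReductionLimit of 4, after which any
// leftover scalars are reduced directly by the epilogue further down.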
17437                                             ExternallyUsedValues);
17438        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17439          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17441          for (Value *V : ReducedVals[Cnt])
17442            if (isa<Instruction>(V))
17443              LocalExternallyUsedValues[TrackedVals[V]];
17445        if (!IsSupportedHorRdxIdentityOp) {
17448                 "Reused values counter map is not empty");
17449          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17450            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17452            Value *V = Candidates[Cnt];
17453            Value *OrigV = TrackedToOrig.find(V)->second;
17454            ++SameValuesCounter[OrigV];
17460        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17461          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17463          Value *RdxVal = Candidates[Cnt];
17464          if (!Visited.insert(RdxVal).second)
17468          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17469            LocalExternallyUsedValues[RdxVal];
17472          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17474              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17475          if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17476            LocalExternallyUsedValues[RdxVal];
17479        if (!IsSupportedHorRdxIdentityOp)
17480          SameValuesCounter.clear();
17481        for (Value *RdxVal : VL)
17482          if (RequiredExtract.contains(RdxVal))
17483            LocalExternallyUsedValues[RdxVal];
17487        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17488          ReplacementToExternal.try_emplace(Pair.second, Pair.first);
17489        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17491          auto RIt = ReplacementToExternal.find(Ext);
17492          while (RIt != ReplacementToExternal.end()) {
17494            RIt = ReplacementToExternal.find(Ext);
17496          auto *It = ExternallyUsedValues.find(Ext);
17497          if (It == ExternallyUsedValues.end())
17499          LocalExternallyUsedValues[Pair.second].append(It->second);
17501        V.buildExternalUses(LocalExternallyUsedValues);
17503        V.computeMinimumValueSizes();
17504        V.transformNodes();
17509            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17512                          << " for reduction\n");
17516        V.getORE()->emit([&]() {
17518                                          SV_NAME, "HorSLPNotBeneficial",
17519                                          ReducedValsToOps.find(VL[0])->second.front())
17520                 << "Vectorizing horizontal reduction is possible "
17521                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17522                 << " and threshold "
17525        if (!AdjustReducedVals())
17526          V.analyzedReductionVals(VL);
17530        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17531                          << Cost << ". (HorRdx)\n");
17532        V.getORE()->emit([&]() {
17534                                    SV_NAME, "VectorizedHorizontalReduction",
17535                                    ReducedValsToOps.find(VL[0])->second.front())
17536                 << "Vectorized horizontal reduction with cost "
17537                 << ore::NV("Cost", Cost) << " and with tree size "
17538                 << ore::NV("TreeSize", V.getTreeSize());
17545        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17547        if (IsCmpSelMinMax)
17548          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17551        Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17552                                                ReplacedExternals, InsertPt);
17559        if ((isBoolLogicOp(RdxRootInst) ||
17560             (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17562          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17565        if (OptReusedScalars && !SameScaleFactor) {
17566          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
17567                                         SameValuesCounter, TrackedToOrig);
17570        Value *ReducedSubTree =
17571            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17572        if (ReducedSubTree->getType() != VL.front()->getType()) {
17573          assert(ReducedSubTree->getType() != VL.front()->getType() &&
17574                 "Expected different reduction type.");
17576              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
17577                                    V.isSignedMinBitwidthRootNode());
17583        if (OptReusedScalars && SameScaleFactor)
17584          ReducedSubTree = emitScaleForReusedOps(
17585              ReducedSubTree, Builder, SameValuesCounter.front().second);
17587        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17589        for (Value *RdxVal : VL) {
17590          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17591          if (IsSupportedHorRdxIdentityOp) {
17592            VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17595          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17596          if (!V.isVectorized(RdxVal))
17597            RequiredExtract.insert(RdxVal);
17602        AnyVectorized = true;
17604      if (OptReusedScalars && !AnyVectorized) {
17605        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17606          Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17607          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17608          Value *OrigV = TrackedToOrig.find(P.first)->second;
17609          VectorizedVals.try_emplace(OrigV, P.second);
17614    if (VectorizedTree) {
17635      if (!AnyBoolLogicOp)
17637      if (isBoolLogicOp(RedOp1) &&
17638          ((!InitStep && LHS == VectorizedTree) ||
17641      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17642                                    getRdxOperand(RedOp2, 0) == RHS ||
17647      if (LHS != VectorizedTree)
17658      unsigned Sz = InstVals.size();
17661      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17664        Value *RdxVal1 = InstVals[I].second;
17665        Value *StableRdxVal1 = RdxVal1;
17666        auto It1 = TrackedVals.find(RdxVal1);
17667        if (It1 != TrackedVals.end())
17668          StableRdxVal1 = It1->second;
17669        Value *RdxVal2 = InstVals[I + 1].second;
17670        Value *StableRdxVal2 = RdxVal2;
17671        auto It2 = TrackedVals.find(RdxVal2);
17672        if (It2 != TrackedVals.end())
17673          StableRdxVal2 = It2->second;
17677        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17679        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17680                                   StableRdxVal2, "op.rdx", ReductionOps);
17681        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17684        ExtraReds[Sz / 2] = InstVals.back();
17688      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17692      for (Value *RdxVal : Candidates) {
17693        if (!Visited.insert(RdxVal).second)
17695        unsigned NumOps = VectorizedVals.lookup(RdxVal);
17702      for (auto &Pair : ExternallyUsedValues) {
17704        for (auto *I : Pair.second)
17708      bool InitStep = true;
17709      while (ExtraReductions.size() > 1) {
17711            FinalGen(ExtraReductions, InitStep);
17712        ExtraReductions.swap(NewReds);
17715      VectorizedTree = ExtraReductions.front().second;
17717      ReductionRoot->replaceAllUsesWith(VectorizedTree);
17726        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17733        for (auto *U : Ignore->users()) {
17735                 "All users must be either in the reduction ops list.");
17738        if (!Ignore->use_empty()) {
17740          Ignore->replaceAllUsesWith(P);
17743      V.removeInstructionsAndOperands(RdxOps);
17745    } else if (!CheckForReusedReductionOps) {
17746      for (ReductionOpsType &RdxOps : ReductionOps)
17747        for (Value *RdxOp : RdxOps)
17748          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17750    return VectorizedTree;
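The FinalGen step above combines the leftover scalar reductions pairwise rather than as one long chain. A minimal standalone analogue over plain integers (editor's sketch; the helper name is invented, and the real code goes through createOp() and FixBoolLogicalOps() first):

#include <cstddef>
#include <vector> // self-contained sketch, not part of SLPVectorizer.cpp

static int reducePairwise(std::vector<int> Vals) {
  while (Vals.size() > 1) {
    std::vector<int> Next;
    // Combine even/odd neighbors, mirroring the (Sz / 2) * 2 loop above.
    for (std::size_t I = 0, E = (Vals.size() / 2) * 2; I < E; I += 2)
      Next.push_back(Vals[I] + Vals[I + 1]);
    if (Vals.size() % 2 == 1)   // odd element carried over, like
      Next.push_back(Vals.back()); // ExtraReds[Sz / 2] above
    Vals = std::move(Next);
  }
  return Vals.empty() ? 0 : Vals.front();
}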
17757                                     bool IsCmpSelMinMax, unsigned ReduxWidth,
17760    Type *ScalarTy = ReducedVals.front()->getType();
17769    int Cnt = ReducedVals.size();
17770    for (Value *RdxVal : ReducedVals) {
17775        Cost += GenCostFn();
17780        auto *RdxOp = cast<Instruction>(U);
17781        if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17789        Cost += ScalarCost;
17791        Cost += GenCostFn();
17796    case RecurKind::Add:
17797    case RecurKind::Mul:
17798    case RecurKind::Or:
17799    case RecurKind::And:
17800    case RecurKind::Xor:
17801    case RecurKind::FAdd:
17802    case RecurKind::FMul: {
17807      ScalarCost = EvaluateScalarCost([&]() {
17812    case RecurKind::FMax:
17813    case RecurKind::FMin:
17814    case RecurKind::FMaximum:
17815    case RecurKind::FMinimum:
17816    case RecurKind::SMax:
17817    case RecurKind::SMin:
17818    case RecurKind::UMax:
17819    case RecurKind::UMin: {
17823      ScalarCost = EvaluateScalarCost([&]() {
17833    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17835                      << " (It is a splitting reduction)\n");
17836    return VectorCost - ScalarCost;
17842    assert(VectorizedValue && "Need to have a vectorized tree node");
17844           "We only handle power-of-two reductions for now");
17845    assert(RdxKind != RecurKind::FMulAdd &&
17846           "A call to the llvm.fmuladd intrinsic is not handled yet");
17848    ++NumVectorInstructions;
17855    assert(IsSupportedHorRdxIdentityOp &&
17856           "The optimization of matched scalar identity horizontal reductions "
17857           "must be supported.");
17859    case RecurKind::Add: {
17861      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17863                        << VectorizedValue << ". (HorRdx)\n");
17864      return Builder.CreateMul(VectorizedValue, Scale);
17866    case RecurKind::Xor: {
17868      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
17869                        << ". (HorRdx)\n");
17872      return VectorizedValue;
17874    case RecurKind::FAdd: {
17876      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17878                        << VectorizedValue << ". (HorRdx)\n");
17879      return Builder.CreateFMul(VectorizedValue, Scale);
17881    case RecurKind::And:
17882    case RecurKind::Or:
17883    case RecurKind::SMax:
17884    case RecurKind::SMin:
17885    case RecurKind::UMax:
17886    case RecurKind::UMin:
17887    case RecurKind::FMax:
17888    case RecurKind::FMin:
17889    case RecurKind::FMaximum:
17890    case RecurKind::FMinimum:
17892      return VectorizedValue;
17893    case RecurKind::Mul:
17894    case RecurKind::FMul:
17895    case RecurKind::FMulAdd:
17896    case RecurKind::IAnyOf:
17897    case RecurKind::FAnyOf:
17898    case RecurKind::None:
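A hedged standalone rendering of the RecurKind::Add case above (editor's sketch; the function name is invented, and it assumes llvm/IR/IRBuilder.h is included): n occurrences of one scalar in an add reduction fold to a single multiply.

static llvm::Value *scaleRepeatedAdd(llvm::IRBuilderBase &Builder,
                                     llvm::Value *Red, unsigned Cnt) {
  // red + red + ... + red (Cnt times)  ==>  red * Cnt
  return Builder.CreateMul(
      Red, llvm::ConstantInt::get(Red->getType(), Cnt));
}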
17910    assert(IsSupportedHorRdxIdentityOp &&
17911           "The optimization of matched scalar identity horizontal reductions "
17912           "must be supported.");
17914    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17915    if (VTy->getElementType() != VL.front()->getType()) {
17919                                       R.isSignedMinBitwidthRootNode());
17922    case RecurKind::Add: {
17925      for (Value *V : VL) {
17926        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17927        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17931                        << VectorizedValue << ". (HorRdx)\n");
17932      return Builder.CreateMul(VectorizedValue, Scale);
17934    case RecurKind::And:
17935    case RecurKind::Or:
17938                        << ". (HorRdx)\n");
17939      return VectorizedValue;
17940    case RecurKind::SMax:
17941    case RecurKind::SMin:
17942    case RecurKind::UMax:
17943    case RecurKind::UMin:
17944    case RecurKind::FMax:
17945    case RecurKind::FMin:
17946    case RecurKind::FMaximum:
17947    case RecurKind::FMinimum:
17950                        << ". (HorRdx)\n");
17951      return VectorizedValue;
17952    case RecurKind::Xor: {
17958          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17960      std::iota(Mask.begin(), Mask.end(), 0);
17961      bool NeedShuffle = false;
17962      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17964        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17965        if (Cnt % 2 == 0) {
17967          NeedShuffle = true;
17973                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17977            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
17978      return VectorizedValue;
17980    case RecurKind::FAdd: {
17983      for (Value *V : VL) {
17984        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17985        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17988      return Builder.CreateFMul(VectorizedValue, Scale);
17990    case RecurKind::Mul:
17991    case RecurKind::FMul:
17992    case RecurKind::FMulAdd:
17993    case RecurKind::IAnyOf:
17994    case RecurKind::FAnyOf:
17995    case RecurKind::None:
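Why the Xor case above is sound (editor's note, not from the source):

// x ^ x == 0, so a lane whose scalar repeats an even number of times
// contributes nothing to a xor reduction; the shuffle mask built above
// replaces exactly those lanes with zeros taken from a null vector, while
// lanes with odd repeat counts keep a single copy of the value.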
18005    return HorizontalReduction::getRdxKind(V);
18008    if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
18009      return cast<FixedVectorType>(IE->getType())->getNumElements();
18011    unsigned AggregateSize = 1;
18012    auto *IV = cast<InsertValueInst>(InsertInst);
18013    Type *CurrentType = IV->getType();
18015      if (auto *ST = dyn_cast<StructType>(CurrentType)) {
18016        for (auto *Elt : ST->elements())
18017          if (Elt != ST->getElementType(0))
18018            return std::nullopt;
18019        AggregateSize *= ST->getNumElements();
18020        CurrentType = ST->getElementType(0);
18021      } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
18022        AggregateSize *= AT->getNumElements();
18023        CurrentType = AT->getElementType();
18024      } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
18025        AggregateSize *= VT->getNumElements();
18026        return AggregateSize;
18028        return AggregateSize;
18030    return std::nullopt;
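A worked example for the aggregate walk above (editor's note):

// For an insertvalue whose aggregate type is [2 x <4 x float>], the array
// level contributes a factor of 2 and the inner fixed vector a factor of 4,
// so getAggregateSize() returns 8. A struct whose elements differ in type
// fails the homogeneity check and yields std::nullopt.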
18039                                   unsigned OperandOffset) {
18042      std::optional<unsigned> OperandIndex =
18046      if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
18048                               BuildVectorOpds, InsertElts, *OperandIndex);
18051        BuildVectorOpds[*OperandIndex] = InsertedOperand;
18052        InsertElts[*OperandIndex] = LastInsertInst;
18054      LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
18055    } while (LastInsertInst != nullptr &&
18056             isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
18079    assert((isa<InsertElementInst>(LastInsertInst) ||
18080            isa<InsertValueInst>(LastInsertInst)) &&
18081           "Expected insertelement or insertvalue instruction!");
18084           "Expected empty result vectors!");
18087    if (!AggregateSize)
18089    BuildVectorOpds.resize(*AggregateSize);
18090    InsertElts.resize(*AggregateSize);
18095    if (BuildVectorOpds.size() >= 2)
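The shape being matched here is the classic buildvector chain (editor's illustration, echoing the findBuildAggregate doc comment):

//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0,    float %s1, i32 1
//   %v2 = insertelement <4 x float> %v1,    float %s2, i32 2
//   %v3 = insertelement <4 x float> %v2,    float %s3, i32 3
// Walking from the last insert upward fills BuildVectorOpds with
// {%s0, %s1, %s2, %s3} and records one insert instruction per lane.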
18113    auto DominatedReduxValue = [&](Value *R) {
18114      return isa<Instruction>(R) &&
18115             DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
18121    if (P->getIncomingBlock(0) == ParentBB) {
18122      Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18123    } else if (P->getIncomingBlock(1) == ParentBB) {
18124      Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18127    if (Rdx && DominatedReduxValue(Rdx))
18140    if (P->getIncomingBlock(0) == BBLatch) {
18141      Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18142    } else if (P->getIncomingBlock(1) == BBLatch) {
18143      Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18146    if (Rdx && DominatedReduxValue(Rdx))
18180    assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18181            isa<IntrinsicInst>(Root)) &&
18182           "Expected binop, select, or intrinsic for reduction matching");
18184        Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
18186        Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
18188      return dyn_cast<Instruction>(RHS);
18190      return dyn_cast<Instruction>(LHS);
18197    Value *Op0 = nullptr;
18198    Value *Op1 = nullptr;
18201    return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
18207    Value *B0 = nullptr, *B1 = nullptr;
18212  bool SLPVectorizerPass::vectorizeHorReduction(
18217    bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
18219    if (Root->getParent() != BB || isa<PHINode>(Root))
18223    auto SelectRoot = [&]() {
18242    std::queue<std::pair<Instruction *, unsigned>> Stack;
18243    Stack.emplace(SelectRoot(), 0);
18247      if (R.isAnalyzedReductionRoot(Inst))
18252      if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
18254      return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
18256    auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18257      if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18264      if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18269    while (!Stack.empty()) {
18272      std::tie(Inst, Level) = Stack.front();
18277      if (R.isDeleted(Inst))
18279      if (Value *VectorizedV = TryToReduce(Inst)) {
18281        if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18283          Stack.emplace(I, Level);
18286        if (R.isDeleted(Inst))
18290        if (!TryAppendToPostponedInsts(Inst)) {
18301        if (VisitedInstrs.insert(Op).second)
18302          if (auto *I = dyn_cast<Instruction>(Op))
18305            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18306                !R.isDeleted(I) && I->getParent() == BB)
18307              Stack.emplace(I, Level);
18316    bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18317    Res |= tryToVectorize(PostponedInsts, R);
18324    for (Value *V : Insts)
18325      if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18326        Res |= tryToVectorize(Inst, R);
18330  bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18333    if (!R.canMapToVector(IVI->getType()))
18341    if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18342      R.getORE()->emit([&]() {
18344               << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18345                  "trying reduction first.";
18349    LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18351    return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
18361        (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18365    if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18366      R.getORE()->emit([&]() {
18368               << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18369                  "trying reduction first.";
18373    LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18374    return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
18377  template <typename T>
18382                                    bool MaxVFOnly, BoUpSLP &R) {
18383    bool Changed = false;
18394      auto *I = dyn_cast<Instruction>(*IncIt);
18395      if (!I || R.isDeleted(I)) {
18399      auto *SameTypeIt = IncIt;
18400      while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18401                                 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18402                                 AreCompatible(*SameTypeIt, *IncIt))) {
18403        auto *I = dyn_cast<Instruction>(*SameTypeIt);
18405        if (I && !R.isDeleted(I))
18410      unsigned NumElts = VL.size();
18411      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18412                        << NumElts << ")\n");
18422      if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
18425        VL.swap(Candidates);
18426        Candidates.clear();
18428          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18434        auto GetMinNumElements = [&R](Value *V) {
18435          unsigned EltSize = R.getVectorElementSize(V);
18436          return std::max(2U, R.getMaxVecRegSize() / EltSize);
18438        if (NumElts < GetMinNumElements(*IncIt) &&
18439            (Candidates.empty() ||
18440             Candidates.front()->getType() == (*IncIt)->getType())) {
18442          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18448      if (Candidates.size() > 1 &&
18449          (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18450        if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18453        } else if (MaxVFOnly) {
18456          for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18458            auto *I = dyn_cast<Instruction>(*It);
18459            if (!I || R.isDeleted(I)) {
18463            auto *SameTypeIt = It;
18464            while (SameTypeIt != End &&
18465                   (!isa<Instruction>(*SameTypeIt) ||
18466                    R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18467                    AreCompatible(*SameTypeIt, *It))) {
18468              auto *I = dyn_cast<Instruction>(*SameTypeIt);
18470              if (I && !R.isDeleted(I))
18473            unsigned NumElts = VL.size();
18474            if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
18480            Candidates.clear();
18484      IncIt = SameTypeIt;
18496  template <bool IsCompatibility>
18501           "Expected valid element types only.");
18503      return IsCompatibility;
18504    auto *CI1 = cast<CmpInst>(V);
18505    auto *CI2 = cast<CmpInst>(V2);
18506    if (CI1->getOperand(0)->getType()->getTypeID() <
18508      return !IsCompatibility;
18509    if (CI1->getOperand(0)->getType()->getTypeID() >
18518    if (BasePred1 < BasePred2)
18519      return !IsCompatibility;
18520    if (BasePred1 > BasePred2)
18523    bool CI1Preds = Pred1 == BasePred1;
18524    bool CI2Preds = Pred2 == BasePred1;
18525    for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18526      auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18527      auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18531        return !IsCompatibility;
18534      if (auto *I1 = dyn_cast<Instruction>(Op1))
18535        if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18536          if (IsCompatibility) {
18537            if (I1->getParent() != I2->getParent())
18544            return NodeI2 != nullptr;
18547          assert((NodeI1 == NodeI2) ==
18549                 "Different nodes should have different DFS numbers");
18550          if (NodeI1 != NodeI2)
18554          if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18556          if (IsCompatibility)
18558          if (I1->getOpcode() != I2->getOpcode())
18559            return I1->getOpcode() < I2->getOpcode();
18562    return IsCompatibility;
18565  template <typename ItT>
18568    bool Changed = false;
18571      if (R.isDeleted(I))
18574      if (auto *RootOp = dyn_cast<Instruction>(Op))
18575        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18579      if (R.isDeleted(I))
18581      Changed |= tryToVectorize(I, R);
18588      return compareCmp<false>(V, V2, *TLI, *DT);
18591    auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18594      return compareCmp<true>(V1, V2, *TLI, *DT);
18601    if (Vals.size() <= 1)
18603    Changed |= tryToVectorizeSequence<Value>(
18604        Vals, CompareSorter, AreCompatibleCompares,
18607          bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18609              auto *Select = dyn_cast<SelectInst>(U);
18611                     Select->getParent() != cast<Instruction>(V)->getParent();
18614          if (ArePossiblyReducedInOtherBlock)
18616          return tryToVectorizeList(Candidates, R, MaxVFOnly);
18622  bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18624    assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18625           "This function only accepts Insert instructions");
18626    bool OpsChanged = false;
18628    for (auto *I : reverse(Instructions)) {
18630      if (R.isDeleted(I) || isa<CmpInst>(I))
18632      if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18634            vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
18635      } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18637            vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
18640      if (R.isDeleted(I))
18642      OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18643      if (R.isDeleted(I) || isa<CmpInst>(I))
18646      if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18648            vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
18649      } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18650        OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
18655    OpsChanged |= tryToVectorize(PostponedInsts, R);
18662    bool Changed = false;
18669    auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18672             "Expected vectorizable types only.");
18681      if (Opcodes1.size() < Opcodes2.size())
18683      if (Opcodes1.size() > Opcodes2.size())
18685      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18688        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18689        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18694            return NodeI2 != nullptr;
18697          assert((NodeI1 == NodeI2) ==
18699                 "Different nodes should have different DFS numbers");
18700          if (NodeI1 != NodeI2)
18703          if (S.getOpcode() && !S.isAltShuffle())
18705          return I1->getOpcode() < I2->getOpcode();
18714        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18715        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18723        bool U1 = isa<UndefValue>(Opcodes1[I]);
18724        bool U2 = isa<UndefValue>(Opcodes2[I]);
18728        auto ValID1 = Opcodes1[I]->getValueID();
18729        auto ValID2 = Opcodes2[I]->getValueID();
18730        if (ValID1 == ValID2)
18732        if (ValID1 < ValID2)
18734        if (ValID1 > ValID2)
18743        assert(U1 && U2 && "The only thing left should be undef & undef.");
18748    auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
18751      if (V1->getType() != V2->getType())
18755      if (Opcodes1.size() != Opcodes2.size())
18757      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18759        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18761        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18762          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18763            if (R.isDeleted(I1) || R.isDeleted(I2))
18765            if (I1->getParent() != I2->getParent())
18772        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18774        if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18780    bool HaveVectorizedPhiNodes = false;
18785        auto *P = dyn_cast<PHINode>(&I);
18791        if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18804        if (!Opcodes.empty())
18808        while (!Nodes.empty()) {
18809          auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18812          for (Value *V : PHI->incoming_values()) {
18813            if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18814              Nodes.push_back(PHI1);
18822      HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18823          Incoming, PHICompare, AreCompatiblePHIs,
18825            return tryToVectorizeList(Candidates, R, MaxVFOnly);
18828      Changed |= HaveVectorizedPhiNodes;
18829      if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
18830            auto *PHI = dyn_cast<PHINode>(P.first);
18831            return !PHI || R.isDeleted(PHI);
18833        PHIToOpcodes.clear();
18835    } while (HaveVectorizedPhiNodes);
18837    VisitedInstrs.clear();
18839    InstSetVector PostProcessInserts;
18843    auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18844      bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18845      if (VectorizeCmps) {
18846        Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18847        PostProcessCmps.clear();
18849      PostProcessInserts.clear();
18854      if (auto *Cmp = dyn_cast<CmpInst>(I))
18855        return PostProcessCmps.contains(Cmp);
18856      return isa<InsertElementInst, InsertValueInst>(I) &&
18857             PostProcessInserts.contains(I);
18863      return I->use_empty() &&
18864             (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18869      if (isa<ScalableVectorType>(It->getType()))
18873      if (R.isDeleted(&*It))
18876      if (!VisitedInstrs.insert(&*It).second) {
18877        if (HasNoUsers(&*It) &&
18878            VectorizeInsertsAndCmps(It->isTerminator())) {
18888      if (isa<DbgInfoIntrinsic>(It))
18892      if (PHINode *P = dyn_cast<PHINode>(It)) {
18894        if (P->getNumIncomingValues() == 2) {
18897          if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18906        for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
18911          if (BB == P->getIncomingBlock(I) ||
18917          if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18918              PI && !IsInPostProcessInstrs(PI)) {
18919            bool Res = vectorizeRootInstruction(nullptr, PI,
18920                                                P->getIncomingBlock(I), R, TTI);
18922            if (Res && R.isDeleted(P)) {
18932      if (HasNoUsers(&*It)) {
18933        bool OpsChanged = false;
18934        auto *SI = dyn_cast<StoreInst>(It);
18944        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18945                              SI->getValueOperand()->hasOneUse();
18947        if (TryToVectorizeRoot) {
18948          for (auto *V : It->operand_values()) {
18951            if (auto *VI = dyn_cast<Instruction>(V);
18952                VI && !IsInPostProcessInstrs(VI))
18954              OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18961          VectorizeInsertsAndCmps(It->isTerminator());
18972      if (isa<InsertElementInst, InsertValueInst>(It))
18973        PostProcessInserts.insert(&*It);
18974      else if (isa<CmpInst>(It))
18975        PostProcessCmps.insert(cast<CmpInst>(&*It));
18982    auto Changed = false;
18983    for (auto &Entry : GEPs) {
18986      if (Entry.second.size() < 2)
18989      LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18990                        << Entry.second.size() << ".\n");
18998        return !R.isDeleted(GEP);
19000      if (It == Entry.second.end())
19002      unsigned MaxVecRegSize = R.getMaxVecRegSize();
19003      unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
19004      if (MaxVecRegSize < EltSize)
19007      unsigned MaxElts = MaxVecRegSize / EltSize;
19008      for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
19009        auto Len = std::min<unsigned>(BE - BI, MaxElts);
19022        Candidates.remove_if([&R](Value *I) {
19023          return R.isDeleted(cast<Instruction>(I)) ||
19024                 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
19032        for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
19033          auto *GEPI = GEPList[I];
19034          if (!Candidates.count(GEPI))
19036          auto *SCEVI = SE->getSCEV(GEPList[I]);
19037          for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
19038            auto *GEPJ = GEPList[J];
19039            auto *SCEVJ = SE->getSCEV(GEPList[J]);
19040            if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
19041              Candidates.remove(GEPI);
19042              Candidates.remove(GEPJ);
19043            } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
19044              Candidates.remove(GEPJ);
19051        if (Candidates.size() < 2)
19058        auto BundleIndex = 0u;
19059        for (auto *V : Candidates) {
19060          auto *GEP = cast<GetElementPtrInst>(V);
19061          auto *GEPIdx = GEP->idx_begin()->get();
19062          assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
19063          Bundle[BundleIndex++] = GEPIdx;
19075        Changed |= tryToVectorizeList(Bundle, R);
19081  bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
19082    bool Changed = false;
19087      if (V->getValueOperand()->getType()->getTypeID() <
19088          V2->getValueOperand()->getType()->getTypeID())
19090      if (V->getValueOperand()->getType()->getTypeID() >
19091          V2->getValueOperand()->getType()->getTypeID())
19093      if (V->getPointerOperandType()->getTypeID() <
19094          V2->getPointerOperandType()->getTypeID())
19096      if (V->getPointerOperandType()->getTypeID() >
19097          V2->getPointerOperandType()->getTypeID())
19100      if (isa<UndefValue>(V->getValueOperand()) ||
19101          isa<UndefValue>(V2->getValueOperand()))
19103      if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
19104        if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19108              DT->getNode(I2->getParent());
19109          assert(NodeI1 && "Should only process reachable instructions");
19110          assert(NodeI2 && "Should only process reachable instructions");
19111          assert((NodeI1 == NodeI2) ==
19113                 "Different nodes should have different DFS numbers");
19114          if (NodeI1 != NodeI2)
19119          return I1->getOpcode() < I2->getOpcode();
19121      if (isa<Constant>(V->getValueOperand()) &&
19122          isa<Constant>(V2->getValueOperand()))
19124      return V->getValueOperand()->getValueID() <
19125             V2->getValueOperand()->getValueID();
19137          isa<UndefValue>(V2->getValueOperand()))
19140      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19141        if (I1->getParent() != I2->getParent())
19144        return S.getOpcode() > 0;
19147          isa<Constant>(V2->getValueOperand()))
19150             V2->getValueOperand()->getValueID();
19155    for (auto &Pair : Stores) {
19156      if (Pair.second.size() < 2)
19160                        << Pair.second.size() << ".\n");
19169                                 Pair.second.rend());
19170      Changed |= tryToVectorizeSequence<StoreInst>(
19171          ReversedStores, StoreSorter, AreCompatibleStores,
19173            return vectorizeStores(Candidates, R, Attempted);
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the bit at the position given by "BitPosition" to 1.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
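The APInt entries above compose naturally. A minimal sketch (the names and bit widths are invented for illustration, not taken from this file):

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  APInt demoAPInt() {
    APInt Bits = APInt::getZero(32);          // 32-bit zero value
    Bits.setBit(4);                           // now 0x10
    APInt Hi = APInt::getBitsSetFrom(32, 28); // bits [28, 32) set
    return Bits.urem(Hi);                     // unsigned remainder
  }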
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
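A short sketch of the ArrayRef accessors listed above, with made-up data. ArrayRef never owns its storage, so the backing array must outlive every view of it:

  #include "llvm/ADT/ArrayRef.h"
  using namespace llvm;

  int demoArrayRef() {
    int Data[] = {1, 2, 3, 4, 5};
    ArrayRef<int> A(Data);
    ArrayRef<int> Mid = A.slice(1, 3);        // view of {2, 3, 4}
    // 1 + 4 + 3 = 8
    return A.front() + Mid.back() + (int)A.drop_back(2).size();
  }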
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
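The two predicate helpers differ in what they preserve: getSwappedPredicate keeps the result while exchanging the operands, getInversePredicate negates the result. A sketch (Cmp is assumed to be any integer compare):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void demoPredicates(ICmpInst *Cmp) {
    // For 'icmp sgt a, b': swapped is ICMP_SLT (b < a states the same
    // fact), inverse is ICMP_SLE (the negation of a > b).
    CmpInst::Predicate Swapped = Cmp->getSwappedPredicate();
    CmpInst::Predicate Inverse = Cmp->getInversePredicate();
    (void)Swapped; (void)Inverse;
  }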
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
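These map operations differ mainly in how they treat a missing key. A minimal sketch with an invented pointer key:

  #include "llvm/ADT/DenseMap.h"
  using namespace llvm;

  unsigned demoDenseMap() {
    static int Key = 42;
    DenseMap<int *, unsigned> M;
    M.try_emplace(&Key, 7);   // inserts only if the key is absent
    if (M.contains(&Key))
      return M.at(&Key);      // at() aborts when the entry is missing
    return M.lookup(&Key);    // lookup() returns a default (0) instead
  }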
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
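A minimal sketch chaining a few of the builder calls listed above; A and B are assumed to be scalar integers of the same type, and the insertion point is assumed to have been set by the caller:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  Value *demoBuilder(IRBuilderBase &Builder, Value *A, Value *B) {
    Value *Cmp = Builder.CreateICmpSLT(A, B, "cmp");      // a < b
    Value *Min = Builder.CreateSelect(Cmp, A, B, "min");  // min(a, b)
    return Builder.CreateBinOp(Instruction::Add, Min, B, "sum");
  }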
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
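A sketch of the usual PHI traversal built from the accessors above (PN is any PHINode):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void visitIncoming(PHINode &PN) {
    for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) {
      BasicBlock *BB = PN.getIncomingBlock(I);
      Value *V = PN.getIncomingValueForBlock(BB); // value flowing from BB
      (void)V;
    }
  }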
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its value...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
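A common idiom built from the entries above: subtract two pointer SCEVs and check whether the distance folds to a constant. A sketch under the assumption that both pointers are analyzable by the same ScalarEvolution instance:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  using namespace llvm;

  bool haveConstantOffset(ScalarEvolution &SE, Value *PtrA, Value *PtrB) {
    const SCEV *Dist = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
    return isa<SCEVConstant>(Dist); // true only for a static offset
  }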
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
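SetVector is useful when lookups must be set-fast but iteration order must stay deterministic: it iterates in first-insertion order. A small sketch:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SetVector.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  SmallVector<int, 8> dedupeKeepOrder(ArrayRef<int> In) {
    SetVector<int> S;
    for (int V : In)
      S.insert(V); // returns false for duplicates
    return SmallVector<int, 8>(S.begin(), S.end());
  }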
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
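Worked examples of the mask classifiers above, using four source elements (the asserts are purely illustrative):

  #include "llvm/IR/Instructions.h"
  #include <cassert>
  using namespace llvm;

  void demoMasks() {
    int Identity[] = {0, 1, 2, 3}; // picks each lane from source 0
    int Reverse[]  = {3, 2, 1, 0}; // same source, lanes reversed
    int Splat[]    = {0, 0, 0, 0}; // every lane copies element 0
    assert(ShuffleVectorInst::isIdentityMask(Identity, 4));
    assert(ShuffleVectorInst::isReverseMask(Reverse, 4));
    assert(ShuffleVectorInst::isZeroEltSplatMask(Splat, 4));
  }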
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
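The standard idiom for visiting set bits uses find_first/find_next as a manual iterator:

  #include "llvm/ADT/SmallBitVector.h"
  using namespace llvm;

  unsigned sumSetBitIndices(const SmallBitVector &BV) {
    unsigned Sum = 0;
    for (int I = BV.find_first(); I != -1; I = BV.find_next(I))
      Sum += (unsigned)I; // I is the index of a set bit
    return Sum;
  }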
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
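FixedVectorType::get and the ElementCount-based VectorType::get construct the same fixed-width type; a sketch building <8 x i32>:

  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  VectorType *demoVecTy(LLVMContext &Ctx) {
    Type *I32 = IntegerType::get(Ctx, 32);
    return VectorType::get(I32, ElementCount::getFixed(8)); // <8 x i32>
  }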
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness, if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, i.e. the pair deemed most likely to form the root of a profitable tree to vectorize.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
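Taken together, the BoUpSLP entries follow a build/reorder/cost/codegen pipeline. Below is a simplified sketch of that driver sequence, modeled on how this file's helpers use the class; Threshold stands in for the real cost-threshold option, and many legality checks are elided:

  // Sketch only; assumes it sits inside SLPVectorizer.cpp, where the
  // file-local BoUpSLP class is visible.
  static bool trySLP(BoUpSLP &R, ArrayRef<Value *> VL, int Threshold) {
    R.buildTree(VL, /*UserIgnoreLst=*/{});
    if (R.isTreeTinyAndNotFullyVectorizable())
      return false;                     // not worth costing
    R.reorderTopToBottom();
    R.reorderBottomToTop();
    R.buildExternalUses();
    R.computeMinimumValueSizes();
    InstructionCost Cost = R.getTreeCost();
    if (!(Cost < -Threshold))
      return false;                     // not profitable enough
    R.vectorizeTree();
    return true;
  }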
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
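The matchers above compose declaratively inside match(). A two-line sketch that checks whether V computes A + A for a known value A:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;

  bool isDoubleOf(Value *V, Value *A) {
    using namespace llvm::PatternMatch;
    return match(V, m_Add(m_Specific(A), m_Specific(A)));
  }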
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
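getPointersDiff underlies consecutive-access checks: the distance is measured in elements of the given type, so a result of 1 means the second pointer is exactly one element past the first. A sketch with invented names:

  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include <optional>
  using namespace llvm;

  static bool areConsecutive(Type *ElemTy, Value *PtrA, Value *PtrB,
                             const DataLayout &DL, ScalarEvolution &SE) {
    std::optional<int> Dist = getPointersDiff(ElemTy, PtrA, ElemTy, PtrB,
                                              DL, SE, /*StrictCheck=*/true);
    return Dist && *Dist == 1; // B is exactly one element past A
  }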
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
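These range wrappers replace explicit begin/end pairs; a one-line sketch:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  using namespace llvm;

  bool allPositive(ArrayRef<int> Vals) {
    return all_of(Vals, [](int V) { return V > 0; });
  }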
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
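A sketch of the single-range form, which pairs each element with its 0-based index:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  using namespace llvm;

  void demoEnumerate(ArrayRef<int> Vals) {
    for (const auto &E : enumerate(Vals)) {
      size_t Idx = E.index(); // 0-based position
      int V = E.value();      // the element itself
      (void)Idx; (void)V;
    }
  }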
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
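As implemented in this file, inversePermutation builds the shuffle mask that undoes a reordering, i.e. Mask[Indices[I]] = I. A worked example with invented values, for Indices = {2, 0, 1}:

  SmallVector<int> Mask;
  inversePermutation({2, 0, 1}, Mask); // Mask == {1, 2, 0}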
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; the incoming register Reg and incoming block Block are taken from the machine instruction.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.