73#ifdef EXPENSIVE_CHECKS
105using namespace slpvectorizer;
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
110STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
114 cl::desc(
"Run the SLP vectorization passes"));
118 cl::desc(
"Only vectorize if you gain more than this "
123 cl::desc(
"Attempt to vectorize horizontal reductions"));
128 "Attempt to vectorize horizontal reductions feeding into a store"));
134 cl::desc(
"Allow optimization of original scalar identity operations on "
135 "matched horizontal reductions."));
139 cl::desc(
"Attempt to vectorize for this register size in bits"));
143 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
151 cl::desc(
"Limit the size of the SLP scheduling region per block"));
155 cl::desc(
"Attempt to vectorize for this register size in bits"));
159 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
163 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
169 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
178 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
182 cl::desc(
"The minimum number of loads, which should be considered strided, "
183 "if the stride is > 1 or is runtime value"));
187 cl::desc(
"The maximum stride, considered to be profitable."));
191 cl::desc(
"Display the SLP trees with Graphviz"));
218 return VectorType::isValidElementType(Ty) && !Ty->
isX86_FP80Ty() &&
225 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
232 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
233 !isa<ExtractValueInst, UndefValue>(V))
235 auto *
I = dyn_cast<Instruction>(V);
236 if (!
I || isa<ExtractValueInst>(
I))
238 if (!isa<FixedVectorType>(
I->getOperand(0)->getType()))
240 if (isa<ExtractElementInst>(
I))
242 assert(isa<InsertElementInst>(V) &&
"Expected only insertelement.");
251 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
267 for (
int I = 1,
E = VL.
size();
I <
E;
I++) {
268 auto *II = dyn_cast<Instruction>(VL[
I]);
289 Value *FirstNonUndef =
nullptr;
290 for (
Value *V : VL) {
291 if (isa<UndefValue>(V))
293 if (!FirstNonUndef) {
297 if (V != FirstNonUndef)
300 return FirstNonUndef !=
nullptr;
305 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
306 return Cmp->isCommutative();
307 if (
auto *BO = dyn_cast<BinaryOperator>(
I))
308 return BO->isCommutative();
320 if (
const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
321 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
324 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
327 if (CI->getValue().uge(VT->getNumElements()))
329 Index *= VT->getNumElements();
330 Index += CI->getZExtValue();
334 const auto *
IV = cast<InsertValueInst>(InsertInst);
335 Type *CurrentType =
IV->getType();
336 for (
unsigned I :
IV->indices()) {
337 if (
const auto *ST = dyn_cast<StructType>(CurrentType)) {
338 Index *= ST->getNumElements();
339 CurrentType = ST->getElementType(
I);
340 }
else if (
const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
341 Index *= AT->getNumElements();
342 CurrentType = AT->getElementType();
375 if (MaskArg == UseMask::UndefsAsMask)
379 if (MaskArg == UseMask::FirstArg &&
Value < VF)
380 UseMask.reset(
Value);
381 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
382 UseMask.reset(
Value - VF);
390template <
bool IsPoisonOnly = false>
394 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
397 auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
400 auto *
C = dyn_cast<Constant>(V);
402 if (!UseMask.empty()) {
404 while (
auto *II = dyn_cast<InsertElementInst>(
Base)) {
405 Base = II->getOperand(0);
406 if (isa<T>(II->getOperand(1)))
413 if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
421 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
428 for (
unsigned I = 0,
E = VecTy->getNumElements();
I !=
E; ++
I) {
429 if (
Constant *Elem =
C->getAggregateElement(
I))
431 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
459static std::optional<TargetTransformInfo::ShuffleKind>
462 find_if(VL, [](
Value *V) {
return isa<ExtractElementInst>(V); });
465 auto *EI0 = cast<ExtractElementInst>(*It);
466 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
469 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
470 Value *Vec1 =
nullptr;
471 Value *Vec2 =
nullptr;
473 ShuffleMode CommonShuffleMode =
Unknown;
475 for (
unsigned I = 0,
E = VL.
size();
I <
E; ++
I) {
477 if (isa<UndefValue>(VL[
I]))
479 auto *EI = cast<ExtractElementInst>(VL[
I]);
480 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
482 auto *Vec = EI->getVectorOperand();
487 if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
Size)
489 if (isa<UndefValue>(EI->getIndexOperand()))
491 auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
497 unsigned IntIdx =
Idx->getValue().getZExtValue();
501 if (!Vec1 || Vec1 == Vec) {
503 }
else if (!Vec2 || Vec2 == Vec) {
509 if (CommonShuffleMode == Permute)
514 CommonShuffleMode = Permute;
517 CommonShuffleMode =
Select;
520 if (CommonShuffleMode ==
Select && Vec2)
530 unsigned Opcode =
E->getOpcode();
531 assert((Opcode == Instruction::ExtractElement ||
532 Opcode == Instruction::ExtractValue) &&
533 "Expected extractelement or extractvalue instruction.");
534 if (Opcode == Instruction::ExtractElement) {
535 auto *CI = dyn_cast<ConstantInt>(
E->getOperand(1));
538 return CI->getZExtValue();
540 auto *EI = cast<ExtractValueInst>(
E);
541 if (EI->getNumIndices() != 1)
543 return *EI->idx_begin();
549struct InstructionsState {
551 Value *OpValue =
nullptr;
562 unsigned getAltOpcode()
const {
567 bool isAltShuffle()
const {
return AltOp != MainOp; }
570 unsigned CheckedOpcode =
I->getOpcode();
571 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
574 InstructionsState() =
delete;
576 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
585 auto *
I = dyn_cast<Instruction>(
Op);
586 if (
I && S.isOpcodeOrAlt(
I))
605 unsigned BaseIndex = 0);
613 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
614 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
615 BaseOp0 == Op0 || BaseOp1 == Op1 ||
626 "Assessing comparisons of different types?");
636 return (BasePred == Pred &&
638 (BasePred == SwappedPred &&
647 unsigned BaseIndex) {
650 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
652 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
653 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
654 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
656 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
658 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
659 unsigned AltOpcode = Opcode;
660 unsigned AltIndex = BaseIndex;
664 auto *IBase = cast<Instruction>(VL[BaseIndex]);
667 if (
auto *
CallBase = dyn_cast<CallInst>(IBase)) {
671 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
673 for (
int Cnt = 0,
E = VL.
size(); Cnt <
E; Cnt++) {
674 auto *
I = cast<Instruction>(VL[Cnt]);
675 unsigned InstOpcode =
I->getOpcode();
676 if (IsBinOp && isa<BinaryOperator>(
I)) {
677 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
681 AltOpcode = InstOpcode;
685 }
else if (IsCastOp && isa<CastInst>(
I)) {
686 Value *Op0 = IBase->getOperand(0);
688 Value *Op1 =
I->getOperand(0);
691 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
693 if (Opcode == AltOpcode) {
696 "Cast isn't safe for alternation, logic needs to be updated!");
697 AltOpcode = InstOpcode;
702 }
else if (
auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
703 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
704 Type *Ty0 = BaseInst->getOperand(0)->getType();
705 Type *Ty1 = Inst->getOperand(0)->getType();
707 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
715 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
720 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
721 if (AltIndex != BaseIndex) {
724 }
else if (BasePred != CurrentPred) {
727 "CmpInst isn't safe for alternation, logic needs to be updated!");
732 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
733 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
736 }
else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
737 if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
738 if (Gep->getNumOperands() != 2 ||
739 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
740 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
741 }
else if (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
743 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
744 }
else if (
auto *LI = dyn_cast<LoadInst>(
I)) {
745 auto *BaseLI = cast<LoadInst>(IBase);
746 if (!LI->isSimple() || !BaseLI->isSimple())
747 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
748 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
749 auto *
CallBase = cast<CallInst>(IBase);
751 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
752 if (Call->hasOperandBundles() &&
753 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
754 Call->op_begin() + Call->getBundleOperandsEndIndex(),
757 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
760 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
763 if (Mappings.
size() != BaseMappings.
size() ||
764 Mappings.
front().ISA != BaseMappings.
front().ISA ||
765 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
766 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
767 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
768 Mappings.
front().Shape.Parameters !=
769 BaseMappings.
front().Shape.Parameters)
770 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
775 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
778 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
779 cast<Instruction>(VL[AltIndex]));
795 case Instruction::Load: {
796 LoadInst *LI = cast<LoadInst>(UserInst);
799 case Instruction::Store: {
800 StoreInst *SI = cast<StoreInst>(UserInst);
801 return (SI->getPointerOperand() == Scalar);
803 case Instruction::Call: {
804 CallInst *CI = cast<CallInst>(UserInst);
807 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
808 Arg.value().get() == Scalar;
820 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
827 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
828 return LI->isSimple();
830 return SI->isSimple();
832 return !
MI->isVolatile();
840 bool ExtendingManyInputs =
false) {
844 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
846 (SubMask.
size() == Mask.size() &&
847 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
848 [](
int Idx) { return Idx == PoisonMaskElem; }))) &&
849 "SubMask with many inputs support must be larger than the mask.");
851 Mask.append(SubMask.
begin(), SubMask.
end());
855 int TermValue = std::min(Mask.size(), SubMask.
size());
856 for (
int I = 0,
E = SubMask.
size();
I <
E; ++
I) {
858 (!ExtendingManyInputs &&
859 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
861 NewMask[
I] = Mask[SubMask[
I]];
877 const unsigned Sz = Order.
size();
880 for (
unsigned I = 0;
I < Sz; ++
I) {
882 UnusedIndices.
reset(Order[
I]);
884 MaskedIndices.
set(
I);
886 if (MaskedIndices.
none())
889 "Non-synced masked/available indices.");
893 assert(
Idx >= 0 &&
"Indices must be synced.");
905 const unsigned E = Indices.
size();
907 for (
unsigned I = 0;
I <
E; ++
I)
908 Mask[Indices[
I]] =
I;
914 assert(!Mask.empty() &&
"Expected non-empty mask.");
918 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
920 Scalars[Mask[
I]] = Prev[
I];
928 auto *
I = dyn_cast<Instruction>(V);
933 auto *IO = dyn_cast<Instruction>(V);
936 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
945 auto *
I = dyn_cast<Instruction>(V);
949 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
951 auto *IU = dyn_cast<Instruction>(U);
954 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
970 return !VL.
empty() &&
974namespace slpvectorizer {
1004 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li),
1005 DT(Dt), AC(AC), DB(DB),
DL(
DL), ORE(ORE), Builder(Se->getContext()) {
1058 return !VectorizableTree.
empty() &&
1059 !VectorizableTree.
front()->UserTreeIndices.empty();
1064 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1065 return VectorizableTree.
front()->Scalars;
1077 VectorizableTree.
clear();
1078 ScalarToTreeEntry.clear();
1079 MultiNodeScalars.clear();
1081 EntryToLastInstruction.clear();
1082 ExternalUses.
clear();
1083 for (
auto &Iter : BlocksSchedules) {
1084 BlockScheduling *BS = Iter.second.get();
1088 InstrElementSize.clear();
1089 UserIgnoreList =
nullptr;
1090 PostponedGathers.
clear();
1091 ValueToGatherNodes.
clear();
1148 return MaxVecRegSize;
1153 return MinVecRegSize;
1161 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
1163 return MaxVF ? MaxVF : UINT_MAX;
1207 bool TryRecursiveCheck =
true)
const;
1231 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1232 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1254 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1255 MaxLevel(MaxLevel) {}
1309 if (isa<LoadInst>(V1)) {
1311 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1316 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1318 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1321 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1324 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1326 ((
int)V1->getNumUses() == NumLanes ||
1327 AllUsersAreInternal(V1, V2)))
1333 auto *LI1 = dyn_cast<LoadInst>(V1);
1334 auto *LI2 = dyn_cast<LoadInst>(V2);
1336 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1341 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1342 LI2->getPointerOperand(),
DL, SE,
true);
1343 if (!Dist || *Dist == 0) {
1346 R.TTI->isLegalMaskedGather(
1354 if (std::abs(*Dist) > NumLanes / 2)
1363 auto *C1 = dyn_cast<Constant>(V1);
1364 auto *C2 = dyn_cast<Constant>(V2);
1378 if (isa<UndefValue>(V2))
1382 Value *EV2 =
nullptr;
1395 int Dist = Idx2 - Idx1;
1398 if (std::abs(Dist) == 0)
1400 if (std::abs(Dist) > NumLanes / 2)
1410 auto *I1 = dyn_cast<Instruction>(V1);
1411 auto *I2 = dyn_cast<Instruction>(V2);
1413 if (I1->getParent() != I2->getParent())
1421 if (S.getOpcode() &&
1422 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1423 !S.isAltShuffle()) &&
1425 return cast<Instruction>(V)->getNumOperands() ==
1426 S.MainOp->getNumOperands();
1432 if (isa<UndefValue>(V2))
1469 int ShallowScoreAtThisLevel =
1478 auto *I1 = dyn_cast<Instruction>(
LHS);
1479 auto *I2 = dyn_cast<Instruction>(
RHS);
1480 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1482 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1483 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1484 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1485 ShallowScoreAtThisLevel))
1486 return ShallowScoreAtThisLevel;
1487 assert(I1 && I2 &&
"Should have early exited.");
1494 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1495 OpIdx1 != NumOperands1; ++OpIdx1) {
1497 int MaxTmpScore = 0;
1498 unsigned MaxOpIdx2 = 0;
1499 bool FoundBest =
false;
1503 ? I2->getNumOperands()
1504 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1505 assert(FromIdx <= ToIdx &&
"Bad index");
1506 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1508 if (Op2Used.
count(OpIdx2))
1513 I1, I2, CurrLevel + 1, std::nullopt);
1516 TmpScore > MaxTmpScore) {
1517 MaxTmpScore = TmpScore;
1524 Op2Used.
insert(MaxOpIdx2);
1525 ShallowScoreAtThisLevel += MaxTmpScore;
1528 return ShallowScoreAtThisLevel;
1559 struct OperandData {
1560 OperandData() =
default;
1561 OperandData(
Value *V,
bool APO,
bool IsUsed)
1562 : V(V), APO(APO), IsUsed(IsUsed) {}
1572 bool IsUsed =
false;
1581 enum class ReorderingMode {
1600 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
1601 return OpsVec[OpIdx][Lane];
1605 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
1606 return OpsVec[OpIdx][Lane];
1611 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
1612 OpIdx != NumOperands; ++OpIdx)
1613 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1615 OpsVec[OpIdx][Lane].IsUsed =
false;
1619 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
1620 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1632 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1633 Value *IdxLaneV = getData(
Idx, Lane).V;
1634 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1637 for (
unsigned Ln = 0,
E = getNumLanes(); Ln <
E; ++Ln) {
1640 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1641 if (!isa<Instruction>(OpIdxLnV))
1643 Uniques.
insert(OpIdxLnV);
1645 int UniquesCount = Uniques.
size();
1646 int UniquesCntWithIdxLaneV =
1647 Uniques.
contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1648 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1649 int UniquesCntWithOpIdxLaneV =
1650 Uniques.
contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1651 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1654 UniquesCntWithOpIdxLaneV) -
1655 (
PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1664 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1665 Value *IdxLaneV = getData(
Idx, Lane).V;
1666 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1675 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1676 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1678 return R.areAllUsersVectorized(IdxLaneI)
1686 static const int ScoreScaleFactor = 10;
1694 int Lane,
unsigned OpIdx,
unsigned Idx,
1704 int SplatScore = getSplatScore(Lane, OpIdx,
Idx);
1705 if (Score <= -SplatScore) {
1710 Score += SplatScore;
1716 Score *= ScoreScaleFactor;
1717 Score += getExternalUseScore(Lane, OpIdx,
Idx);
1735 std::optional<unsigned>
1736 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
1739 unsigned NumOperands = getNumOperands();
1742 Value *OpLastLane = getData(OpIdx, LastLane).V;
1745 ReorderingMode RMode = ReorderingModes[OpIdx];
1746 if (RMode == ReorderingMode::Failed)
1747 return std::nullopt;
1750 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1756 std::optional<unsigned>
Idx;
1760 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
1767 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1769 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
1771 OperandData &OpData = getData(
Idx, Lane);
1773 bool OpAPO = OpData.APO;
1782 if (OpAPO != OpIdxAPO)
1787 case ReorderingMode::Load:
1788 case ReorderingMode::Constant:
1789 case ReorderingMode::Opcode: {
1790 bool LeftToRight = Lane > LastLane;
1791 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
1792 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
1793 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1794 OpIdx,
Idx, IsUsed);
1795 if (Score >
static_cast<int>(BestOp.Score)) {
1797 BestOp.Score = Score;
1798 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1802 case ReorderingMode::Splat:
1803 if (
Op == OpLastLane)
1806 case ReorderingMode::Failed:
1812 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1816 return std::nullopt;
1823 unsigned getBestLaneToStartReordering()
const {
1824 unsigned Min = UINT_MAX;
1825 unsigned SameOpNumber = 0;
1836 for (
int I = getNumLanes();
I > 0; --
I) {
1837 unsigned Lane =
I - 1;
1838 OperandsOrderData NumFreeOpsHash =
1839 getMaxNumOperandsThatCanBeReordered(Lane);
1842 if (NumFreeOpsHash.NumOfAPOs < Min) {
1843 Min = NumFreeOpsHash.NumOfAPOs;
1844 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1846 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1847 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1848 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1851 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1852 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1853 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1854 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1855 auto *It = HashMap.
find(NumFreeOpsHash.Hash);
1856 if (It == HashMap.
end())
1857 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1863 unsigned BestLane = 0;
1864 unsigned CntMin = UINT_MAX;
1866 if (
Data.second.first < CntMin) {
1867 CntMin =
Data.second.first;
1868 BestLane =
Data.second.second;
1875 struct OperandsOrderData {
1878 unsigned NumOfAPOs = UINT_MAX;
1881 unsigned NumOpsWithSameOpcodeParent = 0;
1895 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
1896 unsigned CntTrue = 0;
1897 unsigned NumOperands = getNumOperands();
1907 bool AllUndefs =
true;
1908 unsigned NumOpsWithSameOpcodeParent = 0;
1912 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1913 const OperandData &OpData = getData(OpIdx, Lane);
1918 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
1920 I->getParent() != Parent) {
1921 if (NumOpsWithSameOpcodeParent == 0) {
1922 NumOpsWithSameOpcodeParent = 1;
1924 Parent =
I->getParent();
1926 --NumOpsWithSameOpcodeParent;
1929 ++NumOpsWithSameOpcodeParent;
1933 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1934 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1938 OperandsOrderData
Data;
1939 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1940 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1948 assert((empty() || VL.
size() == getNumLanes()) &&
1949 "Expected same number of lanes");
1950 assert(isa<Instruction>(VL[0]) &&
"Expected instruction");
1951 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1952 OpsVec.
resize(NumOperands);
1953 unsigned NumLanes = VL.
size();
1954 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1955 OpsVec[OpIdx].
resize(NumLanes);
1956 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1957 assert(isa<Instruction>(VL[Lane]) &&
"Expected instruction");
1968 bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
1969 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
1970 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1977 unsigned getNumOperands()
const {
return OpsVec.
size(); }
1980 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
1983 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
1984 return getData(OpIdx, Lane).V;
1988 bool empty()
const {
return OpsVec.
empty(); }
1991 void clear() { OpsVec.
clear(); }
1996 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
1997 bool OpAPO = getData(OpIdx, Lane).APO;
1998 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2002 bool FoundCandidate =
false;
2003 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2004 OperandData &
Data = getData(OpI, Ln);
2005 if (
Data.APO != OpAPO ||
Data.IsUsed)
2008 FoundCandidate =
true;
2013 if (!FoundCandidate)
2023 : TLI(TLI),
DL(
DL), SE(SE), R(R) {
2025 appendOperandsOfVL(RootVL);
2032 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2033 "Expected same num of lanes across all operands");
2034 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2035 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2043 unsigned NumOperands = getNumOperands();
2044 unsigned NumLanes = getNumLanes();
2064 unsigned FirstLane = getBestLaneToStartReordering();
2067 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2068 Value *OpLane0 = getValue(OpIdx, FirstLane);
2071 if (isa<LoadInst>(OpLane0))
2072 ReorderingModes[OpIdx] = ReorderingMode::Load;
2073 else if (isa<Instruction>(OpLane0)) {
2075 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2076 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2078 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2080 else if (isa<Constant>(OpLane0))
2081 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2082 else if (isa<Argument>(OpLane0))
2084 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2087 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2094 auto &&SkipReordering = [
this]() {
2097 for (
const OperandData &
Data : Op0)
2100 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2119 if (SkipReordering())
2122 bool StrategyFailed =
false;
2130 for (
unsigned I = 0;
I < NumOperands; ++
I)
2131 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2133 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2136 int Lane = FirstLane +
Direction * Distance;
2137 if (Lane < 0 || Lane >= (
int)NumLanes)
2140 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2143 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2145 std::optional<unsigned> BestIdx = getBestOperand(
2146 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2153 swap(OpIdx, *BestIdx, Lane);
2156 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2158 StrategyFailed =
true;
2161 if (MainAltOps[OpIdx].
size() != 2) {
2162 OperandData &AltOp = getData(OpIdx, Lane);
2163 InstructionsState OpS =
2165 if (OpS.getOpcode() && OpS.isAltShuffle())
2172 if (!StrategyFailed)
2177#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2180 case ReorderingMode::Load:
2182 case ReorderingMode::Opcode:
2184 case ReorderingMode::Constant:
2186 case ReorderingMode::Splat:
2188 case ReorderingMode::Failed:
2209 const unsigned Indent = 2;
2212 OS <<
"Operand " << Cnt++ <<
"\n";
2213 for (
const OperandData &OpData : OpDataVec) {
2215 if (
Value *V = OpData.V)
2219 OS <<
", APO:" << OpData.APO <<
"}\n";
2241 int BestScore = Limit;
2242 std::optional<int>
Index;
2243 for (
int I : seq<int>(0, Candidates.size())) {
2245 Candidates[
I].second,
2248 if (Score > BestScore) {
2263 DeletedInstructions.insert(
I);
2269 return AnalyzedReductionsRoots.count(
I);
2274 AnalyzedReductionsRoots.insert(
I);
2288 AnalyzedReductionsRoots.clear();
2289 AnalyzedReductionVals.
clear();
2309 bool collectValuesToDemote(
2322 canReorderOperands(TreeEntry *UserTE,
2329 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2333 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2335 TreeEntry *TE =
nullptr;
2337 TE = getTreeEntry(V);
2338 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2340 auto It = MultiNodeScalars.find(V);
2341 if (It != MultiNodeScalars.end()) {
2342 for (TreeEntry *
E : It->second) {
2343 if (
is_contained(
E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2351 if (It != VL.
end()) {
2352 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2360 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2361 unsigned OpIdx)
const {
2362 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2363 const_cast<TreeEntry *
>(UserTE), OpIdx);
2367 bool areAllUsersVectorized(
2376 const TreeEntry *getOperandEntry(
const TreeEntry *
E,
unsigned Idx)
const;
2385 const EdgeInfo &EI);
2396 bool ResizeAllowed =
false)
const;
2407 Value *vectorizeOperand(TreeEntry *
E,
unsigned NodeIdx,
bool PostponedPHIs);
2412 template <
typename BVTy,
typename ResTy,
typename...
Args>
2413 ResTy processBuildVector(
const TreeEntry *
E, Args &...Params);
2418 Value *createBuildVector(
const TreeEntry *
E);
2424 Instruction &getLastInstructionInBundle(
const TreeEntry *
E);
2431 std::optional<TargetTransformInfo::ShuffleKind>
2443 unsigned NumParts)
const;
2455 std::optional<TargetTransformInfo::ShuffleKind>
2456 isGatherShuffledSingleRegisterEntry(
2473 isGatherShuffledEntry(
2476 unsigned NumParts,
bool ForOrder =
false);
2486 void setInsertPointAfterBundle(
const TreeEntry *
E);
2494 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
2498 static void reorderInputsAccordingToOpcode(
2507 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
2523 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
2527 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2544 [Scalars](
Value *V,
int Idx) {
2545 return (isa<UndefValue>(V) &&
2546 Idx == PoisonMaskElem) ||
2547 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2550 if (!ReorderIndices.empty()) {
2557 return IsSame(Scalars, Mask);
2558 if (VL.
size() == ReuseShuffleIndices.size()) {
2560 return IsSame(Scalars, Mask);
2564 return IsSame(Scalars, ReuseShuffleIndices);
2567 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
2568 return State == TreeEntry::NeedToGather &&
2569 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2570 UserTreeIndices.front().UserTE == UserEI.UserTE;
2574 bool hasEqualOperands(
const TreeEntry &TE)
const {
2575 if (
TE.getNumOperands() != getNumOperands())
2578 for (
unsigned I = 0,
E = getNumOperands();
I <
E; ++
I) {
2579 unsigned PrevCount =
Used.count();
2580 for (
unsigned K = 0;
K <
E; ++
K) {
2583 if (getOperand(K) ==
TE.getOperand(
I)) {
2589 if (PrevCount ==
Used.count())
2598 unsigned getVectorFactor()
const {
2599 if (!ReuseShuffleIndices.empty())
2600 return ReuseShuffleIndices.size();
2601 return Scalars.
size();
2636 VecTreeTy &Container;
2660 assert(Operands[OpIdx].empty() &&
"Already resized?");
2662 "Number of operands is greater than the number of scalars.");
2668 void setOperandsInOrder() {
2670 auto *I0 = cast<Instruction>(Scalars[0]);
2671 Operands.resize(I0->getNumOperands());
2672 unsigned NumLanes = Scalars.size();
2673 for (
unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2674 OpIdx != NumOperands; ++OpIdx) {
2676 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2677 auto *
I = cast<Instruction>(Scalars[Lane]);
2678 assert(
I->getNumOperands() == NumOperands &&
2679 "Expected same number of operands");
2680 Operands[OpIdx][Lane] =
I->getOperand(OpIdx);
2704 unsigned getNumOperands()
const {
return Operands.size(); }
2707 Value *getSingleOperand(
unsigned OpIdx)
const {
2709 assert(!Operands[OpIdx].empty() &&
"No operand available");
2714 bool isAltShuffle()
const {
return MainOp != AltOp; }
2717 unsigned CheckedOpcode =
I->getOpcode();
2718 return (getOpcode() == CheckedOpcode ||
2719 getAltOpcode() == CheckedOpcode);
2726 auto *
I = dyn_cast<Instruction>(
Op);
2727 if (
I && isOpcodeOrAlt(
I))
2732 void setOperations(
const InstructionsState &S) {
2746 unsigned getOpcode()
const {
2747 return MainOp ? MainOp->
getOpcode() : 0;
2750 unsigned getAltOpcode()
const {
2756 int findLaneForValue(
Value *V)
const {
2757 unsigned FoundLane = std::distance(Scalars.begin(),
find(Scalars, V));
2758 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2759 if (!ReorderIndices.
empty())
2760 FoundLane = ReorderIndices[FoundLane];
2761 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2762 if (!ReuseShuffleIndices.
empty()) {
2763 FoundLane = std::distance(ReuseShuffleIndices.
begin(),
2764 find(ReuseShuffleIndices, FoundLane));
2781 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
2782 dbgs() <<
"Operand " << OpI <<
":\n";
2783 for (
const Value *V : Operands[OpI])
2786 dbgs() <<
"Scalars: \n";
2787 for (
Value *V : Scalars)
2789 dbgs() <<
"State: ";
2792 dbgs() <<
"Vectorize\n";
2794 case ScatterVectorize:
2795 dbgs() <<
"ScatterVectorize\n";
2797 case StridedVectorize:
2798 dbgs() <<
"StridedVectorize\n";
2801 dbgs() <<
"NeedToGather\n";
2804 dbgs() <<
"MainOp: ";
2806 dbgs() << *MainOp <<
"\n";
2809 dbgs() <<
"AltOp: ";
2811 dbgs() << *AltOp <<
"\n";
2814 dbgs() <<
"VectorizedValue: ";
2815 if (VectorizedValue)
2816 dbgs() << *VectorizedValue <<
"\n";
2819 dbgs() <<
"ReuseShuffleIndices: ";
2820 if (ReuseShuffleIndices.
empty())
2823 for (
int ReuseIdx : ReuseShuffleIndices)
2824 dbgs() << ReuseIdx <<
", ";
2826 dbgs() <<
"ReorderIndices: ";
2827 for (
unsigned ReorderIdx : ReorderIndices)
2828 dbgs() << ReorderIdx <<
", ";
2830 dbgs() <<
"UserTreeIndices: ";
2831 for (
const auto &EInfo : UserTreeIndices)
2832 dbgs() << EInfo <<
", ";
2842 dbgs() <<
"SLP: " << Banner <<
":\n";
2844 dbgs() <<
"SLP: Costs:\n";
2845 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
2846 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
2847 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
2848 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2849 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
2855 std::optional<ScheduleData *> Bundle,
2856 const InstructionsState &S,
2857 const EdgeInfo &UserTreeIdx,
2860 TreeEntry::EntryState EntryState =
2861 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2862 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2863 ReuseShuffleIndices, ReorderIndices);
2867 TreeEntry::EntryState EntryState,
2868 std::optional<ScheduleData *> Bundle,
2869 const InstructionsState &S,
2870 const EdgeInfo &UserTreeIdx,
2873 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2874 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2875 "Need to vectorize gather entry?");
2876 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
2877 TreeEntry *
Last = VectorizableTree.
back().get();
2878 Last->Idx = VectorizableTree.
size() - 1;
2879 Last->State = EntryState;
2880 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2881 ReuseShuffleIndices.end());
2882 if (ReorderIndices.
empty()) {
2884 Last->setOperations(S);
2887 Last->Scalars.assign(VL.
size(),
nullptr);
2890 if (Idx >= VL.size())
2891 return UndefValue::get(VL.front()->getType());
2895 Last->setOperations(S);
2896 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
2898 if (
Last->State != TreeEntry::NeedToGather) {
2899 for (
Value *V : VL) {
2900 const TreeEntry *
TE = getTreeEntry(V);
2902 "Scalar already in tree!");
2905 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
2908 ScalarToTreeEntry[
V] =
Last;
2911 ScheduleData *BundleMember = *Bundle;
2912 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2915 "Bundle and VL out of sync");
2917 for (
Value *V : VL) {
2922 BundleMember->TE =
Last;
2923 BundleMember = BundleMember->NextInBundle;
2926 assert(!BundleMember &&
"Bundle and VL out of sync");
2928 MustGather.
insert(VL.begin(), VL.end());
2935 if (UserTreeIdx.UserTE)
2936 Last->UserTreeIndices.push_back(UserTreeIdx);
2943 TreeEntry::VecTreeTy VectorizableTree;
2948 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2949 VectorizableTree[
Id]->dump();
2955 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
2957 const TreeEntry *getTreeEntry(
Value *V)
const {
2958 return ScalarToTreeEntry.lookup(V);
2963 TreeEntry::EntryState getScalarsVectorizationState(
2993 using ValueToGatherNodesMap =
2995 ValueToGatherNodesMap ValueToGatherNodes;
2998 struct ExternalUser {
3022 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3023 auto It = AliasCache.
find(Key);
3024 if (It != AliasCache.
end())
3029 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3033 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3061 UserList ExternalUses;
3077 struct ScheduleData {
3080 enum { InvalidDeps = -1 };
3082 ScheduleData() =
default;
3084 void init(
int BlockSchedulingRegionID,
Value *OpVal) {
3085 FirstInBundle =
this;
3086 NextInBundle =
nullptr;
3087 NextLoadStore =
nullptr;
3088 IsScheduled =
false;
3089 SchedulingRegionID = BlockSchedulingRegionID;
3090 clearDependencies();
3097 if (hasValidDependencies()) {
3098 assert(UnscheduledDeps <= Dependencies &&
"invariant");
3100 assert(UnscheduledDeps == Dependencies &&
"invariant");
3104 assert(isSchedulingEntity() &&
3105 "unexpected scheduled state");
3106 for (
const ScheduleData *BundleMember =
this; BundleMember;
3107 BundleMember = BundleMember->NextInBundle) {
3108 assert(BundleMember->hasValidDependencies() &&
3109 BundleMember->UnscheduledDeps == 0 &&
3110 "unexpected scheduled state");
3111 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3112 "only bundle is marked scheduled");
3116 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3117 "all bundle members must be in same basic block");
3123 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
3127 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
3131 bool isPartOfBundle()
const {
3132 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3137 bool isReady()
const {
3138 assert(isSchedulingEntity() &&
3139 "can't consider non-scheduling entity for ready list");
3140 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3146 int incrementUnscheduledDeps(
int Incr) {
3147 assert(hasValidDependencies() &&
3148 "increment of unscheduled deps would be meaningless");
3149 UnscheduledDeps += Incr;
3150 return FirstInBundle->unscheduledDepsInBundle();
3155 void resetUnscheduledDeps() {
3156 UnscheduledDeps = Dependencies;
3160 void clearDependencies() {
3161 Dependencies = InvalidDeps;
3162 resetUnscheduledDeps();
3163 MemoryDependencies.clear();
3164 ControlDependencies.clear();
3167 int unscheduledDepsInBundle()
const {
3168 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3170 for (
const ScheduleData *BundleMember =
this; BundleMember;
3171 BundleMember = BundleMember->NextInBundle) {
3172 if (BundleMember->UnscheduledDeps == InvalidDeps)
3174 Sum += BundleMember->UnscheduledDeps;
3180 if (!isSchedulingEntity()) {
3181 os <<
"/ " << *Inst;
3182 }
else if (NextInBundle) {
3184 ScheduleData *SD = NextInBundle;
3186 os <<
';' << *SD->Inst;
3187 SD = SD->NextInBundle;
3198 Value *OpValue =
nullptr;
3201 TreeEntry *
TE =
nullptr;
3205 ScheduleData *FirstInBundle =
nullptr;
3209 ScheduleData *NextInBundle =
nullptr;
3213 ScheduleData *NextLoadStore =
nullptr;
3227 int SchedulingRegionID = 0;
3230 int SchedulingPriority = 0;
3236 int Dependencies = InvalidDeps;
3242 int UnscheduledDeps = InvalidDeps;
3246 bool IsScheduled =
false;
3251 const BoUpSLP::ScheduleData &SD) {
3276 struct BlockScheduling {
3278 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
3282 ScheduleStart =
nullptr;
3283 ScheduleEnd =
nullptr;
3284 FirstLoadStoreInRegion =
nullptr;
3285 LastLoadStoreInRegion =
nullptr;
3286 RegionHasStackSave =
false;
3290 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3293 ScheduleRegionSize = 0;
3297 ++SchedulingRegionID;
3301 if (BB !=
I->getParent())
3304 ScheduleData *SD = ScheduleDataMap.lookup(
I);
3305 if (SD && isInSchedulingRegion(SD))
3310 ScheduleData *getScheduleData(
Value *V) {
3311 if (
auto *
I = dyn_cast<Instruction>(V))
3312 return getScheduleData(
I);
3316 ScheduleData *getScheduleData(
Value *V,
Value *Key) {
3318 return getScheduleData(V);
3319 auto I = ExtraScheduleDataMap.find(V);
3320 if (
I != ExtraScheduleDataMap.end()) {
3321 ScheduleData *SD =
I->second.lookup(Key);
3322 if (SD && isInSchedulingRegion(SD))
3328 bool isInSchedulingRegion(ScheduleData *SD)
const {
3329 return SD->SchedulingRegionID == SchedulingRegionID;
3334 template <
typename ReadyListType>
3335 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3336 SD->IsScheduled =
true;
3339 for (ScheduleData *BundleMember = SD; BundleMember;
3340 BundleMember = BundleMember->NextInBundle) {
3341 if (BundleMember->Inst != BundleMember->OpValue)
3347 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
3348 doForAllOpcodes(
I, [&ReadyList](ScheduleData *OpDef) {
3349 if (OpDef && OpDef->hasValidDependencies() &&
3350 OpDef->incrementUnscheduledDeps(-1) == 0) {
3354 ScheduleData *DepBundle = OpDef->FirstInBundle;
3355 assert(!DepBundle->IsScheduled &&
3356 "already scheduled bundle gets ready");
3357 ReadyList.insert(DepBundle);
3359 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
3367 if (TreeEntry *TE = BundleMember->TE) {
3369 int Lane = std::distance(
TE->Scalars.begin(),
3370 find(
TE->Scalars, BundleMember->Inst));
3371 assert(Lane >= 0 &&
"Lane not set");
3379 auto *
In = BundleMember->Inst;
3381 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3382 In->getNumOperands() ==
TE->getNumOperands()) &&
3383 "Missed TreeEntry operands?");
3386 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
3387 OpIdx != NumOperands; ++OpIdx)
3388 if (
auto *
I = dyn_cast<Instruction>(
TE->getOperand(OpIdx)[Lane]))
3393 for (
Use &U : BundleMember->Inst->operands())
3394 if (
auto *
I = dyn_cast<Instruction>(
U.get()))
3398 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3399 if (MemoryDepSD->hasValidDependencies() &&
3400 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3403 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3404 assert(!DepBundle->IsScheduled &&
3405 "already scheduled bundle gets ready");
3406 ReadyList.insert(DepBundle);
3408 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
3412 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3413 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3416 ScheduleData *DepBundle = DepSD->FirstInBundle;
3417 assert(!DepBundle->IsScheduled &&
3418 "already scheduled bundle gets ready");
3419 ReadyList.insert(DepBundle);
3421 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
3432 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3433 ScheduleStart->comesBefore(ScheduleEnd) &&
3434 "Not a valid scheduling region?");
3436 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3437 auto *SD = getScheduleData(
I);
3440 assert(isInSchedulingRegion(SD) &&
3441 "primary schedule data not in window?");
3442 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3443 "entire bundle in window!");
3445 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->verify(); });
3448 for (
auto *SD : ReadyInsts) {
3449 assert(SD->isSchedulingEntity() && SD->isReady() &&
3450 "item in ready list not ready?");
3455 void doForAllOpcodes(
Value *V,
3457 if (ScheduleData *SD = getScheduleData(V))
3459 auto I = ExtraScheduleDataMap.find(V);
3460 if (
I != ExtraScheduleDataMap.end())
3461 for (
auto &
P :
I->second)
3462 if (isInSchedulingRegion(
P.second))
3467 template <
typename ReadyListType>
3468 void initialFillReadyList(ReadyListType &ReadyList) {
3469 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3470 doForAllOpcodes(
I, [&](ScheduleData *SD) {
3471 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3473 ReadyList.insert(SD);
3475 <<
"SLP: initially in ready list: " << *SD <<
"\n");
3490 std::optional<ScheduleData *>
3492 const InstructionsState &S);
3498 ScheduleData *allocateScheduleDataChunks();
3502 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
3507 ScheduleData *PrevLoadStore,
3508 ScheduleData *NextLoadStore);
3512 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
3516 void resetSchedule();
3537 ExtraScheduleDataMap;
3550 ScheduleData *FirstLoadStoreInRegion =
nullptr;
3554 ScheduleData *LastLoadStoreInRegion =
nullptr;
3559 bool RegionHasStackSave =
false;
3562 int ScheduleRegionSize = 0;
3571 int SchedulingRegionID = 1;
3579 void scheduleBlock(BlockScheduling *BS);
3586 struct OrdersTypeDenseMapInfo {
3599 static unsigned getHashValue(
const OrdersType &V) {
3620 unsigned MaxVecRegSize;
3621 unsigned MinVecRegSize;
3646 struct ChildIteratorType
3648 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3659 return R.VectorizableTree[0].get();
3663 return {
N->UserTreeIndices.begin(),
N->Container};
3667 return {
N->UserTreeIndices.end(),
N->Container};
3672 class nodes_iterator {
3683 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
3687 return nodes_iterator(R->VectorizableTree.begin());
3691 return nodes_iterator(R->VectorizableTree.end());
3694 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
3705 OS << Entry->Idx <<
".\n";
3708 for (
auto *V : Entry->Scalars) {
3710 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
3711 return EU.Scalar == V;
3721 if (Entry->State == TreeEntry::NeedToGather)
3723 if (Entry->State == TreeEntry::ScatterVectorize ||
3724 Entry->State == TreeEntry::StridedVectorize)
3725 return "color=blue";
3734 for (
auto *
I : DeletedInstructions) {
3735 for (
Use &U :
I->operands()) {
3736 auto *
Op = dyn_cast<Instruction>(U.get());
3737 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
3741 I->dropAllReferences();
3743 for (
auto *
I : DeletedInstructions) {
3745 "trying to erase instruction with users.");
3746 I->eraseFromParent();
3752#ifdef EXPENSIVE_CHECKS
3763 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
3764 "Expected non-empty mask.");
3767 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
3769 Reuses[Mask[
I]] = Prev[
I];
3777 bool BottomOrder =
false) {
3778 assert(!Mask.empty() &&
"Expected non-empty mask.");
3779 unsigned Sz = Mask.size();
3782 if (Order.
empty()) {
3784 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
3786 PrevOrder.
swap(Order);
3789 for (
unsigned I = 0;
I < Sz; ++
I)
3791 Order[
I] = PrevOrder[Mask[
I]];
3793 return Data.value() == Sz ||
Data.index() ==
Data.value();
3802 if (Order.
empty()) {
3804 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
3814 for (
unsigned I = 0;
I < Sz; ++
I)
3816 Order[MaskOrder[
I]] =
I;
3820std::optional<BoUpSLP::OrdersType>
3822 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
3826 Type *ScalarTy = GatheredScalars.
front()->getType();
3827 int NumScalars = GatheredScalars.
size();
3829 return std::nullopt;
3832 if (NumParts == 0 || NumParts >= NumScalars)
3838 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3840 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3843 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
3844 return std::nullopt;
3845 OrdersType CurrentOrder(NumScalars, NumScalars);
3846 if (GatherShuffles.
size() == 1 &&
3848 Entries.front().front()->isSame(TE.Scalars)) {
3851 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
3852 return CurrentOrder;
3856 return all_of(Mask, [&](
int I) {
3863 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
3864 (Entries.size() != 1 ||
3865 Entries.front().front()->ReorderIndices.empty())) ||
3866 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
3867 return std::nullopt;
3872 for (
int I : seq<int>(0, NumParts)) {
3873 if (ShuffledSubMasks.
test(
I))
3875 const int VF = GetVF(
I);
3880 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
3881 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
3882 ShuffledSubMasks.
set(
I);
3886 int FirstMin = INT_MAX;
3887 int SecondVecFound =
false;
3888 for (
int K : seq<int>(0, PartSz)) {
3889 int Idx = Mask[
I * PartSz + K];
3891 Value *V = GatheredScalars[
I * PartSz + K];
3893 SecondVecFound =
true;
3902 SecondVecFound =
true;
3906 FirstMin = (FirstMin / PartSz) * PartSz;
3908 if (SecondVecFound) {
3909 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
3910 ShuffledSubMasks.
set(
I);
3913 for (
int K : seq<int>(0, PartSz)) {
3914 int Idx = Mask[
I * PartSz + K];
3918 if (
Idx >= PartSz) {
3919 SecondVecFound =
true;
3922 if (CurrentOrder[
I * PartSz +
Idx] >
3923 static_cast<unsigned>(
I * PartSz + K) &&
3924 CurrentOrder[
I * PartSz +
Idx] !=
3925 static_cast<unsigned>(
I * PartSz +
Idx))
3926 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
3929 if (SecondVecFound) {
3930 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
3931 ShuffledSubMasks.
set(
I);
3936 int PartSz = NumScalars / NumParts;
3937 if (!ExtractShuffles.
empty())
3938 TransformMaskToOrder(
3939 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
3940 if (!ExtractShuffles[
I])
3943 for (
unsigned Idx : seq<unsigned>(0, PartSz)) {
3944 int K =
I * PartSz +
Idx;
3947 if (!TE.ReuseShuffleIndices.empty())
3948 K = TE.ReuseShuffleIndices[K];
3949 if (!TE.ReorderIndices.empty())
3950 K = std::distance(TE.ReorderIndices.begin(),
3951 find(TE.ReorderIndices, K));
3952 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
3955 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
3957 .getKnownMinValue());
3962 if (GatherShuffles.
size() == 1 && NumParts != 1) {
3963 if (ShuffledSubMasks.
any())
3964 return std::nullopt;
3965 PartSz = NumScalars;
3968 if (!Entries.empty())
3969 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
3970 if (!GatherShuffles[
I])
3972 return std::max(Entries[
I].front()->getVectorFactor(),
3973 Entries[
I].back()->getVectorFactor());
3976 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
3977 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
3978 return std::nullopt;
3979 return std::move(CurrentOrder);
3984 bool CompareOpcodes =
true) {
3987 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
3990 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
3993 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
3997 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4002template <
typename T>
4004 Align CommonAlignment = cast<T>(VL.
front())->getAlign();
4006 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->
getAlign());
4007 return CommonAlignment;
4012 unsigned Sz = Order.
size();
4014 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4025static std::optional<Value *>
4031 const SCEV *PtrSCEVLowest =
nullptr;
4032 const SCEV *PtrSCEVHighest =
nullptr;
4038 return std::nullopt;
4040 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4041 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4045 if (isa<SCEVCouldNotCompute>(Diff))
4046 return std::nullopt;
4048 PtrSCEVLowest = PtrSCEV;
4052 if (isa<SCEVCouldNotCompute>(Diff1))
4053 return std::nullopt;
4055 PtrSCEVHighest = PtrSCEV;
4061 if (isa<SCEVCouldNotCompute>(Dist))
4062 return std::nullopt;
4063 int Size =
DL.getTypeStoreSize(ElemTy);
4064 auto TryGetStride = [&](
const SCEV *Dist,
4065 const SCEV *Multiplier) ->
const SCEV * {
4066 if (
const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4067 if (M->getOperand(0) == Multiplier)
4068 return M->getOperand(1);
4069 if (M->getOperand(1) == Multiplier)
4070 return M->getOperand(0);
4073 if (Multiplier == Dist)
4078 const SCEV *Stride =
nullptr;
4079 if (
Size != 1 || SCEVs.
size() > 2) {
4081 Stride = TryGetStride(Dist, Sz);
4083 return std::nullopt;
4085 if (!Stride || isa<SCEVConstant>(Stride))
4086 return std::nullopt;
4089 using DistOrdPair = std::pair<int64_t, int>;
4091 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4093 bool IsConsecutive =
true;
4094 for (
const SCEV *PtrSCEV : SCEVs) {
4096 if (PtrSCEV != PtrSCEVLowest) {
4098 const SCEV *Coeff = TryGetStride(Diff, Stride);
4100 return std::nullopt;
4101 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4102 if (!SC || isa<SCEVCouldNotCompute>(SC))
4103 return std::nullopt;
4107 return std::nullopt;
4108 Dist = SC->getAPInt().getZExtValue();
4112 return std::nullopt;
4113 auto Res = Offsets.emplace(Dist, Cnt);
4115 return std::nullopt;
4117 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4120 if (Offsets.size() != SCEVs.
size())
4121 return std::nullopt;
4122 SortedIndices.
clear();
4123 if (!IsConsecutive) {
4127 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4128 SortedIndices[Cnt] = Pair.second;
4149 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
4155 const unsigned Sz = VL.
size();
4157 auto *POIter = PointerOps.
begin();
4158 for (
Value *V : VL) {
4159 auto *L = cast<LoadInst>(V);
4162 *POIter = L->getPointerOperand();
4170 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4181 if (Order.
empty()) {
4182 Ptr0 = PointerOps.
front();
4183 PtrN = PointerOps.
back();
4185 Ptr0 = PointerOps[Order.
front()];
4186 PtrN = PointerOps[Order.
back()];
4188 std::optional<int> Diff =
4191 if (
static_cast<unsigned>(*Diff) == Sz - 1)
4194 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4206 (
static_cast<unsigned>(std::abs(*Diff)) <=
4209 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4210 *Diff == -(
static_cast<int>(Sz) - 1))) {
4211 int Stride = *Diff /
static_cast<int>(Sz - 1);
4212 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
4224 else if (
Ptr != Ptr0)
4229 if (((Dist / Stride) * Stride) != Dist ||
4230 !Dists.
insert(Dist).second)
4233 if (Dists.
size() == Sz)
4239 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment) {
4240 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
4242 unsigned MaxVF = std::max<unsigned>(
bit_floor(VL.
size() / 2), MinVF);
4243 MaxVF = std::min(
getMaximumVF(Sz, Instruction::Load), MaxVF);
4244 for (
unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4245 unsigned VectorizedCnt = 0;
4247 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End;
4248 Cnt += VF, ++VectorizedCnt) {
4266 if (VectorizedCnt == VL.
size() / VF) {
4270 Instruction::Load, VecTy,
4276 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
4280 Instruction::Load, SubVecTy, LI0->getAlign(),
4281 LI0->getPointerAddressSpace(),
CostKind,
4286 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4291 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4296 "Expected only consecutive, strided or masked gather loads.");
4299 for (
int Idx : seq<int>(0, VL.
size()))
4303 ShuffleMask,
CostKind,
I * VF, SubVecTy);
4308 if (MaskedGatherCost > VecLdCost)
4318 bool ProfitableGatherPointers =
4321 return L->isLoopInvariant(V);
4323 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
4324 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
4326 (
GEP &&
GEP->getNumOperands() == 2 &&
4327 isa<Constant, Instruction>(
GEP->getOperand(1)));
4329 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4334 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4353 "Expected list of pointer operands.");
4358 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4363 std::optional<int> Diff =
4369 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4375 if (Bases.
size() > VL.
size() / 2 - 1)
4379 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4385 bool AnyConsecutive =
false;
4386 for (
auto &
Base : Bases) {
4387 auto &Vec =
Base.second;
4388 if (Vec.size() > 1) {
4390 const std::tuple<Value *, int, unsigned> &
Y) {
4391 return std::get<1>(
X) < std::get<1>(
Y);
4393 int InitialOffset = std::get<1>(Vec[0]);
4395 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4401 SortedIndices.
clear();
4402 if (!AnyConsecutive)
4405 for (
auto &
Base : Bases) {
4406 for (
auto &
T :
Base.second)
4411 "Expected SortedIndices to be the size of VL");
4415std::optional<BoUpSLP::OrdersType>
4417 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4418 Type *ScalarTy = TE.Scalars[0]->getType();
4421 Ptrs.
reserve(TE.Scalars.size());
4422 for (
Value *V : TE.Scalars) {
4423 auto *L = dyn_cast<LoadInst>(V);
4424 if (!L || !L->isSimple())
4425 return std::nullopt;
4431 return std::move(Order);
4432 return std::nullopt;
4443 if (VU->
getType() != V->getType())
4446 if (!VU->
hasOneUse() && !V->hasOneUse())
4452 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4458 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
4459 bool IsReusedIdx =
false;
4461 if (IE2 == VU && !IE1)
4463 if (IE1 == V && !IE2)
4464 return V->hasOneUse();
4465 if (IE1 && IE1 != V) {
4467 IsReusedIdx |= ReusedIdx.
test(Idx1);
4468 ReusedIdx.
set(Idx1);
4469 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
4472 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4474 if (IE2 && IE2 != VU) {
4476 IsReusedIdx |= ReusedIdx.
test(Idx2);
4477 ReusedIdx.
set(Idx2);
4478 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4481 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4483 }
while (!IsReusedIdx && (IE1 || IE2));
4487std::optional<BoUpSLP::OrdersType>
4491 if (!TE.ReuseShuffleIndices.empty()) {
4493 return std::nullopt;
4501 unsigned Sz = TE.Scalars.size();
4502 if (TE.State == TreeEntry::NeedToGather) {
4503 if (std::optional<OrdersType> CurrentOrder =
4508 ::addMask(Mask, TE.ReuseShuffleIndices);
4509 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4510 unsigned Sz = TE.Scalars.size();
4511 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
4514 Res[
Idx + K * Sz] =
I + K * Sz;
4516 return std::move(Res);
4519 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4521 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4522 return std::nullopt;
4526 if (TE.ReorderIndices.empty())
4527 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4530 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4531 unsigned VF = ReorderMask.
size();
4533 unsigned NumParts = VF / Sz;
4535 for (
unsigned I = 0;
I < VF;
I += Sz) {
4537 unsigned UndefCnt = 0;
4546 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
4548 return std::nullopt;
4550 for (
unsigned K = 0; K < NumParts; ++K)
4551 ResOrder[Val + Sz * K] =
I + K;
4553 return std::move(ResOrder);
4555 unsigned VF = TE.getVectorFactor();
4558 TE.ReuseShuffleIndices.end());
4559 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4561 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4562 return Idx && *Idx < Sz;
4565 if (TE.ReorderIndices.empty())
4566 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4569 for (
unsigned I = 0;
I < VF; ++
I) {
4570 int &
Idx = ReusedMask[
I];
4573 Value *V = TE.Scalars[ReorderMask[
Idx]];
4575 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
4581 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
4582 auto *It = ResOrder.
begin();
4583 for (
unsigned K = 0; K < VF; K += Sz) {
4587 std::iota(SubMask.begin(), SubMask.end(), 0);
4589 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
4590 std::advance(It, Sz);
4592 if (TE.State == TreeEntry::NeedToGather &&
4594 [](
const auto &
Data) {
return Data.index() ==
Data.value(); }))
4595 return std::nullopt;
4596 return std::move(ResOrder);
4598 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4599 any_of(TE.UserTreeIndices,
4601 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4603 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
4604 return std::nullopt;
4605 if ((TE.State == TreeEntry::Vectorize ||
4606 TE.State == TreeEntry::StridedVectorize) &&
4607 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4608 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4610 return TE.ReorderIndices;
4611 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4612 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
4613 Value *V1 = TE.Scalars[I1];
4614 Value *V2 = TE.Scalars[I2];
4615 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
4621 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
4622 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4623 if (
auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4624 if (
auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4631 if (
auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4632 if (
auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4633 if (EE1->getOperand(0) != EE2->getOperand(0))
4639 auto IsIdentityOrder = [](
const OrdersType &Order) {
4640 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
4645 if (!TE.ReorderIndices.empty())
4646 return TE.ReorderIndices;
4649 std::iota(Phis.begin(), Phis.end(), 0);
4651 for (
unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4654 for (
unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4655 ResOrder[Id] = PhiToId[Phis[Id]];
4656 if (IsIdentityOrder(ResOrder))
4657 return std::nullopt;
4658 return std::move(ResOrder);
4660 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4664 if ((TE.getOpcode() == Instruction::ExtractElement ||
4667 return isa<UndefValue, ExtractElementInst>(V);
4670 [](
Value *V) { return isa<ExtractElementInst>(V); }))) &&
4672 auto *EE = dyn_cast<ExtractElementInst>(V);
4673 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4678 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4680 if (Reuse || !CurrentOrder.
empty())
4681 return std::move(CurrentOrder);
4689 int Sz = TE.Scalars.size();
4693 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
4694 if (It == TE.Scalars.begin())
4697 if (It != TE.Scalars.end()) {
4699 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4714 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4717 return std::move(Order);
4722 return std::nullopt;
4723 if (TE.Scalars.size() >= 4)
4727 return CurrentOrder;
4729 return std::nullopt;
4739 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
4741 if (Cluster != FirstCluster)
4747void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
4750 const unsigned Sz =
TE.Scalars.size();
4752 if (
TE.State != TreeEntry::NeedToGather ||
4759 addMask(NewMask,
TE.ReuseShuffleIndices);
4761 TE.ReorderIndices.clear();
4768 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
4769 *
End =
TE.ReuseShuffleIndices.end();
4770 It !=
End; std::advance(It, Sz))
4771 std::iota(It, std::next(It, Sz), 0);
4777 "Expected same size of orders");
4778 unsigned Sz = Order.
size();
4780 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
4781 if (Order[
Idx] != Sz)
4782 UsedIndices.
set(Order[
Idx]);
4784 if (SecondaryOrder.
empty()) {
4785 for (
unsigned Idx : seq<unsigned>(0, Sz))
4786 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
4789 for (
unsigned Idx : seq<unsigned>(0, Sz))
4790 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
4791 !UsedIndices.
test(SecondaryOrder[
Idx]))
4792 Order[
Idx] = SecondaryOrder[
Idx];
4812 ExternalUserReorderMap;
4818 for_each(VectorizableTree, [
this, &TTIRef, &VFToOrderedEntries,
4819 &GathersToOrders, &ExternalUserReorderMap,
4820 &AltShufflesToOrders, &PhisToOrders](
4821 const std::unique_ptr<TreeEntry> &TE) {
4824 findExternalStoreUsersReorderIndices(TE.get());
4825 if (!ExternalUserReorderIndices.
empty()) {
4826 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4828 std::move(ExternalUserReorderIndices));
4834 if (TE->isAltShuffle()) {
4837 unsigned Opcode0 = TE->getOpcode();
4838 unsigned Opcode1 = TE->getAltOpcode();
4841 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4842 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4843 OpcodeMask.
set(Lane);
4846 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4852 if (std::optional<OrdersType> CurrentOrder =
4862 const TreeEntry *UserTE = TE.get();
4864 if (UserTE->UserTreeIndices.size() != 1)
4867 return EI.UserTE->State == TreeEntry::Vectorize &&
4868 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4871 UserTE = UserTE->UserTreeIndices.back().UserTE;
4874 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4875 if (!(TE->State == TreeEntry::Vectorize ||
4876 TE->State == TreeEntry::StridedVectorize) ||
4877 !TE->ReuseShuffleIndices.empty())
4878 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
4879 if (TE->State == TreeEntry::Vectorize &&
4880 TE->getOpcode() == Instruction::PHI)
4881 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
4886 for (
unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
4888 auto It = VFToOrderedEntries.
find(VF);
4889 if (It == VFToOrderedEntries.
end())
4901 for (
const TreeEntry *OpTE : OrderedEntries) {
4904 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
4907 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
4909 if (OpTE->State == TreeEntry::NeedToGather ||
4910 !OpTE->ReuseShuffleIndices.empty()) {
4911 auto It = GathersToOrders.find(OpTE);
4912 if (It != GathersToOrders.end())
4915 if (OpTE->isAltShuffle()) {
4916 auto It = AltShufflesToOrders.find(OpTE);
4917 if (It != AltShufflesToOrders.end())
4920 if (OpTE->State == TreeEntry::Vectorize &&
4921 OpTE->getOpcode() == Instruction::PHI) {
4922 auto It = PhisToOrders.
find(OpTE);
4923 if (It != PhisToOrders.
end())
4926 return OpTE->ReorderIndices;
4929 auto It = ExternalUserReorderMap.
find(OpTE);
4930 if (It != ExternalUserReorderMap.
end()) {
4931 const auto &ExternalUserReorderIndices = It->second;
4935 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
4936 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
4937 ExternalUserReorderIndices.size();
4939 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
4940 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
4947 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4948 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4951 unsigned E = Order.size();
4954 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4957 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
4959 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4962 if (OrdersUses.empty())
4965 const unsigned Sz = Order.size();
4966 for (
unsigned Idx : seq<unsigned>(0, Sz))
4967 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
4972 unsigned IdentityCnt = 0;
4973 unsigned FilledIdentityCnt = 0;
4975 for (
auto &Pair : OrdersUses) {
4976 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
4977 if (!Pair.first.empty())
4978 FilledIdentityCnt += Pair.second;
4979 IdentityCnt += Pair.second;
4984 unsigned Cnt = IdentityCnt;
4985 for (
auto &Pair : OrdersUses) {
4989 if (Cnt < Pair.second ||
4990 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
4991 Cnt == Pair.second && !BestOrder.
empty() &&
4992 IsIdentityOrder(BestOrder))) {
4994 BestOrder = Pair.first;
5001 if (IsIdentityOrder(BestOrder))
5007 unsigned E = BestOrder.
size();
5009 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5012 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5014 if (TE->Scalars.size() != VF) {
5015 if (TE->ReuseShuffleIndices.size() == VF) {
5021 return EI.UserTE->Scalars.size() == VF ||
5022 EI.UserTE->Scalars.size() ==
5025 "All users must be of VF size.");
5028 reorderNodeWithReuses(*TE, Mask);
5032 if ((TE->State == TreeEntry::Vectorize ||
5033 TE->State == TreeEntry::StridedVectorize) &&
5036 !TE->isAltShuffle()) {
5040 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5041 TE->reorderOperands(Mask);
5044 TE->reorderOperands(Mask);
5045 assert(TE->ReorderIndices.empty() &&
5046 "Expected empty reorder sequence.");
5049 if (!TE->ReuseShuffleIndices.empty()) {
5056 addMask(NewReuses, TE->ReuseShuffleIndices);
5057 TE->ReuseShuffleIndices.swap(NewReuses);
5063bool BoUpSLP::canReorderOperands(
5064 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5067 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
5068 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
5069 return OpData.first ==
I &&
5070 (OpData.second->State == TreeEntry::Vectorize ||
5071 OpData.second->State == TreeEntry::StridedVectorize);
5074 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
5076 if (
any_of(TE->UserTreeIndices,
5077 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5081 Edges.emplace_back(
I, TE);
5087 if (TE->State != TreeEntry::Vectorize &&
5088 TE->State != TreeEntry::StridedVectorize &&
5089 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5093 TreeEntry *
Gather =
nullptr;
5095 [&
Gather, UserTE,
I](TreeEntry *TE) {
5096 assert(TE->State != TreeEntry::Vectorize &&
5097 TE->State != TreeEntry::StridedVectorize &&
5098 "Only non-vectorized nodes are expected.");
5099 if (
any_of(TE->UserTreeIndices,
5100 [UserTE,
I](
const EdgeInfo &EI) {
5101 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5103 assert(TE->isSame(UserTE->getOperand(
I)) &&
5104 "Operand entry does not match operands.");
5125 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5126 if (TE->State != TreeEntry::Vectorize &&
5127 TE->State != TreeEntry::StridedVectorize)
5129 if (std::optional<OrdersType> CurrentOrder =
5131 OrderedEntries.
insert(TE.get());
5132 if (!(TE->State == TreeEntry::Vectorize ||
5133 TE->State == TreeEntry::StridedVectorize) ||
5134 !TE->ReuseShuffleIndices.empty())
5135 GathersToOrders.
insert(TE.get());
5144 while (!OrderedEntries.
empty()) {
5149 for (TreeEntry *TE : OrderedEntries) {
5150 if (!(TE->State == TreeEntry::Vectorize ||
5151 TE->State == TreeEntry::StridedVectorize ||
5152 (TE->State == TreeEntry::NeedToGather &&
5154 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5157 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5159 !Visited.
insert(TE).second) {
5165 for (
EdgeInfo &EI : TE->UserTreeIndices) {
5166 TreeEntry *UserTE = EI.
UserTE;
5167 auto It =
Users.find(UserTE);
5168 if (It ==
Users.end())
5169 It =
Users.insert({UserTE, {}}).first;
5170 It->second.emplace_back(EI.
EdgeIdx, TE);
5174 for (TreeEntry *TE : Filtered)
5175 OrderedEntries.remove(TE);
5177 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5179 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
5180 return Data1.first->Idx > Data2.first->Idx;
5182 for (
auto &
Data : UsersVec) {
5185 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
5187 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5188 OrderedEntries.remove(
Op.second);
5201 for (
const auto &
Op :
Data.second) {
5202 TreeEntry *OpTE =
Op.second;
5203 if (!VisitedOps.
insert(OpTE).second)
5205 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5207 const auto Order = [&]() ->
const OrdersType {
5208 if (OpTE->State == TreeEntry::NeedToGather ||
5209 !OpTE->ReuseShuffleIndices.empty())
5212 return OpTE->ReorderIndices;
5216 if (Order.size() == 1)
5219 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
5220 return P.second == OpTE;
5223 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5224 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5227 unsigned E = Order.size();
5230 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5233 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5236 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5238 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
5239 const auto AllowsReordering = [&](
const TreeEntry *TE) {
5240 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5241 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5242 (IgnoreReorder && TE->Idx == 0))
5244 if (TE->State == TreeEntry::NeedToGather) {
5253 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
5254 TreeEntry *UserTE = EI.
UserTE;
5255 if (!VisitedUsers.
insert(UserTE).second)
5260 if (AllowsReordering(UserTE))
5268 if (
static_cast<unsigned>(
count_if(
5269 Ops, [UserTE, &AllowsReordering](
5270 const std::pair<unsigned, TreeEntry *> &
Op) {
5271 return AllowsReordering(
Op.second) &&
5274 return EI.UserTE == UserTE;
5276 })) <= Ops.
size() / 2)
5277 ++Res.first->second;
5280 if (OrdersUses.empty()) {
5281 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5282 OrderedEntries.remove(
Op.second);
5286 const unsigned Sz = Order.size();
5287 for (
unsigned Idx : seq<unsigned>(0, Sz))
5288 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5293 unsigned IdentityCnt = 0;
5294 unsigned VF =
Data.second.front().second->getVectorFactor();
5296 for (
auto &Pair : OrdersUses) {
5297 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5298 IdentityCnt += Pair.second;
5303 unsigned Cnt = IdentityCnt;
5304 for (
auto &Pair : OrdersUses) {
5308 if (Cnt < Pair.second) {
5310 BestOrder = Pair.first;
5317 if (IsIdentityOrder(BestOrder)) {
5318 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5319 OrderedEntries.remove(
Op.second);
5328 unsigned E = BestOrder.
size();
5330 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5332 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
5333 TreeEntry *TE =
Op.second;
5334 OrderedEntries.remove(TE);
5335 if (!VisitedOps.
insert(TE).second)
5337 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
5338 reorderNodeWithReuses(*TE, Mask);
5342 if (TE->State != TreeEntry::Vectorize &&
5343 TE->State != TreeEntry::StridedVectorize &&
5344 (TE->State != TreeEntry::ScatterVectorize ||
5345 TE->ReorderIndices.empty()))
5347 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
5348 TE->ReorderIndices.empty()) &&
5349 "Non-matching sizes of user/operand entries.");
5351 if (IgnoreReorder && TE == VectorizableTree.front().get())
5352 IgnoreReorder =
false;
5355 for (TreeEntry *
Gather : GatherOps) {
5357 "Unexpected reordering of gathers.");
5358 if (!
Gather->ReuseShuffleIndices.empty()) {
5364 OrderedEntries.remove(
Gather);
5368 if (
Data.first->State != TreeEntry::Vectorize ||
5369 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5370 Data.first->getMainOp()) ||
5371 Data.first->isAltShuffle())
5372 Data.first->reorderOperands(Mask);
5373 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
5374 Data.first->isAltShuffle() ||
5375 Data.first->State == TreeEntry::StridedVectorize) {
5379 if (
Data.first->ReuseShuffleIndices.empty() &&
5380 !
Data.first->ReorderIndices.empty() &&
5381 !
Data.first->isAltShuffle()) {
5384 OrderedEntries.insert(
Data.first);
5392 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5393 VectorizableTree.front()->ReuseShuffleIndices.empty())
5394 VectorizableTree.front()->ReorderIndices.clear();
5400 for (
auto &TEPtr : VectorizableTree) {
5401 TreeEntry *Entry = TEPtr.get();
5404 if (Entry->State == TreeEntry::NeedToGather)
5408 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5409 Value *Scalar = Entry->Scalars[Lane];
5410 if (!isa<Instruction>(Scalar))
5412 int FoundLane = Entry->findLaneForValue(Scalar);
5415 const auto *ExtI = ExternallyUsedValues.
find(Scalar);
5416 if (ExtI != ExternallyUsedValues.
end()) {
5417 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
5418 << Lane <<
" from " << *Scalar <<
".\n");
5419 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
5421 for (
User *U : Scalar->users()) {
5429 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5433 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5437 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5439 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5440 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
5442 assert(UseEntry->State != TreeEntry::NeedToGather &&
"Bad state");
5449 <<
" from lane " << Lane <<
" from " << *Scalar
5451 ExternalUses.emplace_back(Scalar, U, FoundLane);
5458BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
5460 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5461 Value *V = TE->Scalars[Lane];
5467 for (
User *U : V->users()) {
5468 auto *SI = dyn_cast<StoreInst>(U);
5469 if (SI ==
nullptr || !SI->isSimple() ||
5473 if (getTreeEntry(U))
5477 auto &StoresVec = PtrToStoresMap[
Ptr];
5480 if (StoresVec.size() > Lane)
5483 if (!StoresVec.empty() &&
5484 SI->getParent() != StoresVec.back()->getParent())
5487 if (!StoresVec.empty() &&
5488 SI->getValueOperand()->getType() !=
5489 StoresVec.back()->getValueOperand()->getType())
5491 StoresVec.push_back(SI);
5494 return PtrToStoresMap;
5498 OrdersType &ReorderIndices)
const {
5506 StoreOffsetVec[0] = {S0, 0};
5509 for (
unsigned Idx : seq<unsigned>(1, StoresVec.
size())) {
5511 std::optional<int> Diff =
5513 SI->getPointerOperand(), *
DL, *SE,
5518 StoreOffsetVec[
Idx] = {StoresVec[
Idx], *Diff};
5523 stable_sort(StoreOffsetVec, [](
const std::pair<StoreInst *, int> &Pair1,
5524 const std::pair<StoreInst *, int> &Pair2) {
5525 int Offset1 = Pair1.second;
5526 int Offset2 = Pair2.second;
5527 return Offset1 < Offset2;
5531 for (
unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5532 if (StoreOffsetVec[
Idx].second != StoreOffsetVec[
Idx - 1].second + 1)
5537 ReorderIndices.reserve(StoresVec.
size());
5540 [SI](
const std::pair<StoreInst *, int> &Pair) {
5541 return Pair.first ==
SI;
5543 StoreOffsetVec.begin();
5544 ReorderIndices.push_back(
Idx);
5549 auto IsIdentityOrder = [](
const OrdersType &Order) {
5550 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
5555 if (IsIdentityOrder(ReorderIndices))
5556 ReorderIndices.clear();
5563 for (
unsigned Idx : Order)
5570BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
5571 unsigned NumLanes =
TE->Scalars.size();
5574 collectUserStores(TE);
5583 for (
const auto &Pair : PtrToStoresMap) {
5584 auto &StoresVec = Pair.second;
5586 if (StoresVec.size() != NumLanes)
5591 if (!canFormVector(StoresVec, ReorderIndices))
5596 ExternalReorderIndices.
push_back(ReorderIndices);
5598 return ExternalReorderIndices;
5604 UserIgnoreList = &UserIgnoreLst;
5607 buildTree_rec(Roots, 0,
EdgeInfo());
5614 buildTree_rec(Roots, 0,
EdgeInfo());
5621 Value *NeedsScheduling =
nullptr;
5622 for (
Value *V : VL) {
5625 if (!NeedsScheduling) {
5626 NeedsScheduling = V;
5631 return NeedsScheduling;
5642 bool AllowAlternate) {
5646 if (
auto *LI = dyn_cast<LoadInst>(V)) {
5649 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
5654 if (isa<ExtractElementInst, UndefValue>(V))
5656 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
5658 !isa<UndefValue>(EI->getIndexOperand()))
5661 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
5664 if ((isa<BinaryOperator, CastInst>(
I)) &&
5674 : cast<CastInst>(
I)->getOperand(0)->getType()));
5676 if (isa<CastInst>(
I)) {
5677 std::pair<size_t, size_t> OpVals =
5683 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
5685 if (CI->isCommutative())
5691 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
5705 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
5706 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5707 SubKey =
hash_value(Gep->getPointerOperand());
5711 !isa<ConstantInt>(
I->getOperand(1))) {
5719 return std::make_pair(Key, SubKey);
5729BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5732 assert(S.MainOp &&
"Expected instructions with same/alternate opcodes only.");
5734 unsigned ShuffleOrOp =
5735 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
5736 auto *VL0 = cast<Instruction>(S.OpValue);
5737 switch (ShuffleOrOp) {
5738 case Instruction::PHI: {
5741 for (
Value *
Incoming : cast<PHINode>(V)->incoming_values()) {
5743 if (Term &&
Term->isTerminator()) {
5745 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
5746 return TreeEntry::NeedToGather;
5750 return TreeEntry::Vectorize;
5752 case Instruction::ExtractValue:
5753 case Instruction::ExtractElement: {
5754 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
5755 if (Reuse || !CurrentOrder.empty())
5756 return TreeEntry::Vectorize;
5758 return TreeEntry::NeedToGather;
5760 case Instruction::InsertElement: {
5764 for (
Value *V : VL) {
5765 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
5767 "Non-constant or undef index?");
5771 return !SourceVectors.contains(V);
5774 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
5775 "different source vectors.\n");
5776 return TreeEntry::NeedToGather;
5779 return TreeEntry::Vectorize;
5781 case Instruction::Load: {
5790 return TreeEntry::Vectorize;
5792 return TreeEntry::ScatterVectorize;
5794 return TreeEntry::StridedVectorize;
5797 Type *ScalarTy = VL0->getType();
5798 if (
DL->getTypeSizeInBits(ScalarTy) !=
5799 DL->getTypeAllocSizeInBits(ScalarTy))
5800 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
5802 [](
Value *V) {
return !cast<LoadInst>(V)->isSimple(); }))
5807 return TreeEntry::NeedToGather;
5811 case Instruction::ZExt:
5812 case Instruction::SExt:
5813 case Instruction::FPToUI:
5814 case Instruction::FPToSI:
5815 case Instruction::FPExt:
5816 case Instruction::PtrToInt:
5817 case Instruction::IntToPtr:
5818 case Instruction::SIToFP:
5819 case Instruction::UIToFP:
5820 case Instruction::Trunc:
5821 case Instruction::FPTrunc:
5822 case Instruction::BitCast: {
5823 Type *SrcTy = VL0->getOperand(0)->getType();
5824 for (
Value *V : VL) {
5825 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
5828 dbgs() <<
"SLP: Gathering casts with different src types.\n");
5829 return TreeEntry::NeedToGather;
5832 return TreeEntry::Vectorize;
5834 case Instruction::ICmp:
5835 case Instruction::FCmp: {
5839 Type *ComparedTy = VL0->getOperand(0)->getType();
5840 for (
Value *V : VL) {
5842 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
5843 Cmp->getOperand(0)->getType() != ComparedTy) {
5844 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
5845 return TreeEntry::NeedToGather;
5848 return TreeEntry::Vectorize;
5850 case Instruction::Select:
5851 case Instruction::FNeg:
5852 case Instruction::Add:
5853 case Instruction::FAdd:
5854 case Instruction::Sub:
5855 case Instruction::FSub:
5856 case Instruction::Mul:
5857 case Instruction::FMul:
5858 case Instruction::UDiv:
5859 case Instruction::SDiv:
5860 case Instruction::FDiv:
5861 case Instruction::URem:
5862 case Instruction::SRem:
5863 case Instruction::FRem:
5864 case Instruction::Shl:
5865 case Instruction::LShr:
5866 case Instruction::AShr:
5867 case Instruction::And:
5868 case Instruction::Or:
5869 case Instruction::Xor:
5870 return TreeEntry::Vectorize;
5871 case Instruction::GetElementPtr: {
5873 for (
Value *V : VL) {
5874 auto *
I = dyn_cast<GetElementPtrInst>(V);
5877 if (
I->getNumOperands() != 2) {
5878 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
5879 return TreeEntry::NeedToGather;
5885 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
5886 for (
Value *V : VL) {
5887 auto *
GEP = dyn_cast<GEPOperator>(V);
5890 Type *CurTy =
GEP->getSourceElementType();
5892 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
5893 return TreeEntry::NeedToGather;
5898 Type *Ty1 = VL0->getOperand(1)->getType();
5899 for (
Value *V : VL) {
5900 auto *
I = dyn_cast<GetElementPtrInst>(V);
5903 auto *
Op =
I->getOperand(1);
5904 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
5905 (
Op->getType() != Ty1 &&
5906 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
5907 Op->getType()->getScalarSizeInBits() >
5908 DL->getIndexSizeInBits(
5909 V->getType()->getPointerAddressSpace())))) {
5911 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
5912 return TreeEntry::NeedToGather;
5916 return TreeEntry::Vectorize;
5918 case Instruction::Store: {
5920 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
5923 if (
DL->getTypeSizeInBits(ScalarTy) !=
5924 DL->getTypeAllocSizeInBits(ScalarTy)) {
5925 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
5926 return TreeEntry::NeedToGather;
5930 for (
Value *V : VL) {
5931 auto *
SI = cast<StoreInst>(V);
5932 if (!
SI->isSimple()) {
5934 return TreeEntry::NeedToGather;
5943 if (CurrentOrder.empty()) {
5944 Ptr0 = PointerOps.
front();
5945 PtrN = PointerOps.
back();
5947 Ptr0 = PointerOps[CurrentOrder.front()];
5948 PtrN = PointerOps[CurrentOrder.back()];
5950 std::optional<int> Dist =
5953 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
5954 return TreeEntry::Vectorize;
5958 return TreeEntry::NeedToGather;
5960 case Instruction::Call: {
5963 CallInst *CI = cast<CallInst>(VL0);
5974 return TreeEntry::NeedToGather;
5979 for (
unsigned J = 0; J != NumArgs; ++J)
5982 for (
Value *V : VL) {
5983 CallInst *CI2 = dyn_cast<CallInst>(V);
5989 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
5991 return TreeEntry::NeedToGather;
5995 for (
unsigned J = 0; J != NumArgs; ++J) {
5998 if (ScalarArgs[J] != A1J) {
6000 <<
"SLP: mismatched arguments in call:" << *CI
6001 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
6002 return TreeEntry::NeedToGather;
6011 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
6012 <<
"!=" << *V <<
'\n');
6013 return TreeEntry::NeedToGather;
6017 return TreeEntry::Vectorize;
6019 case Instruction::ShuffleVector: {
6022 if (!S.isAltShuffle()) {
6023 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
6024 return TreeEntry::NeedToGather;
6026 return TreeEntry::Vectorize;
6030 return TreeEntry::NeedToGather;
6035 const EdgeInfo &UserTreeIdx) {
6041 auto TryToFindDuplicates = [&](
const InstructionsState &S,
6042 bool DoNotFail =
false) {
6045 for (
Value *V : VL) {
6052 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
6057 size_t NumUniqueScalarValues = UniqueValues.
size();
6058 if (NumUniqueScalarValues == VL.size()) {
6059 ReuseShuffleIndicies.
clear();
6062 if (NumUniqueScalarValues <= 1 ||
6063 (UniquePositions.size() == 1 &&
all_of(UniqueValues,
6065 return isa<UndefValue>(V) ||
6068 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6069 if (DoNotFail && UniquePositions.size() > 1 &&
6070 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6072 return isa<ExtractElementInst>(V) ||
6073 areAllUsersVectorized(cast<Instruction>(V),
6077 if (PWSz == VL.size()) {
6078 ReuseShuffleIndicies.
clear();
6080 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
6081 NonUniqueValueVL.
append(PWSz - UniqueValues.
size(),
6082 UniqueValues.
back());
6083 VL = NonUniqueValueVL;
6088 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6099 if (!EphValues.
empty()) {
6100 for (
Value *V : VL) {
6101 if (EphValues.
count(V)) {
6103 <<
") is ephemeral.\n");
6104 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6114 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6119 cast<Instruction>(
I)->getOpcode() ==
6120 cast<Instruction>(S.MainOp)->getOpcode();
6122 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
6123 if (TryToFindDuplicates(S))
6124 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6125 ReuseShuffleIndicies);
6130 if (S.getOpcode() == Instruction::ExtractElement &&
6131 isa<ScalableVectorType>(
6132 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6133 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
6134 if (TryToFindDuplicates(S))
6135 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6136 ReuseShuffleIndicies);
6141 if (S.OpValue->getType()->isVectorTy() &&
6142 !isa<InsertElementInst>(S.OpValue)) {
6144 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6148 if (
StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6149 if (
SI->getValueOperand()->getType()->isVectorTy()) {
6150 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to store vector type.\n");
6151 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6160 auto &&NotProfitableForVectorization = [&S,
this,
6162 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6171 for (
Value *V : VL) {
6172 auto *
I = cast<Instruction>(V);
6174 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6178 if ((IsCommutative &&
6179 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6181 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
6183 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
6185 auto *
I1 = cast<Instruction>(VL.front());
6186 auto *I2 = cast<Instruction>(VL.back());
6187 for (
int Op = 0,
E = S.MainOp->getNumOperands();
Op <
E; ++
Op)
6189 I2->getOperand(
Op));
6190 if (
static_cast<unsigned>(
count_if(
6191 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6193 })) >= S.MainOp->getNumOperands() / 2)
6195 if (S.MainOp->getNumOperands() > 2)
6197 if (IsCommutative) {
6200 for (
int Op = 0,
E = S.MainOp->getNumOperands();
Op <
E; ++
Op)
6202 I2->getOperand((
Op + 1) %
E));
6204 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6213 bool IsScatterVectorizeUserTE =
6214 UserTreeIdx.UserTE &&
6215 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6216 bool AreAllSameInsts =
6218 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6222 auto *
I = dyn_cast<GetElementPtrInst>(V);
6226 BB =
I->getParent();
6227 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
6233 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6236 NotProfitableForVectorization(VL)) {
6237 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
6238 if (TryToFindDuplicates(S))
6239 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6240 ReuseShuffleIndicies);
6248 if (TreeEntry *
E = getTreeEntry(S.OpValue)) {
6249 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.OpValue <<
".\n");
6250 if (!
E->isSame(VL)) {
6251 auto It = MultiNodeScalars.
find(S.OpValue);
6252 if (It != MultiNodeScalars.
end()) {
6253 auto *TEIt =
find_if(It->getSecond(),
6254 [&](TreeEntry *ME) { return ME->isSame(VL); });
6255 if (TEIt != It->getSecond().end())
6265 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
6266 if (TryToFindDuplicates(S))
6267 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6268 ReuseShuffleIndicies);
6274 E->UserTreeIndices.push_back(UserTreeIdx);
6275 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.OpValue
6282 for (
Value *V : VL) {
6283 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6286 if (getTreeEntry(V)) {
6288 <<
") is already in tree.\n");
6289 if (TryToFindDuplicates(S))
6290 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6291 ReuseShuffleIndicies);
6297 if (UserIgnoreList && !UserIgnoreList->empty()) {
6298 for (
Value *V : VL) {
6299 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6300 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
6301 if (TryToFindDuplicates(S))
6302 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6303 ReuseShuffleIndicies);
6311 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6312 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6314 assert(S.OpValue->getType()->isPointerTy() &&
6315 count_if(VL, [](
Value *V) {
return isa<GetElementPtrInst>(V); }) >=
6317 "Expected pointers only.");
6319 const auto *It =
find_if(VL, [](
Value *V) {
return isa<GetElementPtrInst>(V); });
6320 assert(It != VL.end() &&
"Expected at least one GEP.");
6326 auto *VL0 = cast<Instruction>(S.OpValue);
6333 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6342 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6347 if (!TryToFindDuplicates(S,
true))
6353 TreeEntry::EntryState State = getScalarsVectorizationState(
6354 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6355 if (State == TreeEntry::NeedToGather) {
6356 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6357 ReuseShuffleIndicies);
6361 auto &BSRef = BlocksSchedules[BB];
6363 BSRef = std::make_unique<BlockScheduling>(BB);
6365 BlockScheduling &BS = *BSRef;
6367 std::optional<ScheduleData *> Bundle =
6368 BS.tryScheduleBundle(UniqueValues,
this, S);
6369#ifdef EXPENSIVE_CHECKS
6374 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
6375 assert((!BS.getScheduleData(VL0) ||
6376 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6377 "tryScheduleBundle should cancelScheduling on failure");
6378 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6379 ReuseShuffleIndicies);
6382 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
6384 unsigned ShuffleOrOp = S.isAltShuffle() ?
6385 (
unsigned) Instruction::ShuffleVector : S.getOpcode();
6386 switch (ShuffleOrOp) {
6387 case Instruction::PHI: {
6388 auto *PH = cast<PHINode>(VL0);
6391 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6396 for (
unsigned I = 0,
E = PH->getNumIncomingValues();
I <
E; ++
I) {
6406 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
6407 PH->getIncomingBlock(
I)));
6411 for (
unsigned OpIdx = 0, OpE = OperandsVec.
size(); OpIdx != OpE; ++OpIdx)
6412 buildTree_rec(OperandsVec[OpIdx],
Depth + 1, {
TE, OpIdx});
6415 case Instruction::ExtractValue:
6416 case Instruction::ExtractElement: {
6417 if (CurrentOrder.empty()) {
6418 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
6419 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6420 ReuseShuffleIndicies);
6424 Op0.
assign(VL.size(), VL0->getOperand(0));
6425 VectorizableTree.back()->setOperand(0, Op0);
6429 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
6431 for (
unsigned Idx : CurrentOrder)
6438 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6439 ReuseShuffleIndicies, CurrentOrder);
6443 Op0.
assign(VL.size(), VL0->getOperand(0));
6444 VectorizableTree.back()->setOperand(0, Op0);
6447 case Instruction::InsertElement: {
6448 assert(ReuseShuffleIndicies.
empty() &&
"All inserts should be unique");
6450 auto OrdCompare = [](
const std::pair<int, int> &P1,
6451 const std::pair<int, int> &P2) {
6452 return P1.first > P2.first;
6455 decltype(OrdCompare)>
6456 Indices(OrdCompare);
6457 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
6459 Indices.emplace(
Idx,
I);
6461 OrdersType CurrentOrder(VL.size(), VL.size());
6462 bool IsIdentity =
true;
6463 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
6464 CurrentOrder[Indices.top().second] =
I;
6465 IsIdentity &= Indices.top().second ==
I;
6469 CurrentOrder.clear();
6470 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6471 std::nullopt, CurrentOrder);
6474 constexpr int NumOps = 2;
6476 for (
int I = 0;
I < NumOps; ++
I) {
6478 VectorOperands[
I].
push_back(cast<Instruction>(V)->getOperand(
I));
6480 TE->setOperand(
I, VectorOperands[
I]);
6482 buildTree_rec(VectorOperands[NumOps - 1],
Depth + 1, {
TE, NumOps - 1});
6485 case Instruction::Load: {
6492 TreeEntry *
TE =
nullptr;
6495 case TreeEntry::Vectorize:
6496 if (CurrentOrder.empty()) {
6498 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6499 ReuseShuffleIndicies);
6503 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6504 ReuseShuffleIndicies, CurrentOrder);
6507 TE->setOperandsInOrder();
6509 case TreeEntry::StridedVectorize:
6511 if (CurrentOrder.empty()) {
6512 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6513 UserTreeIdx, ReuseShuffleIndicies);
6515 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6516 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6518 TE->setOperandsInOrder();
6521 case TreeEntry::ScatterVectorize:
6523 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6524 UserTreeIdx, ReuseShuffleIndicies);
6525 TE->setOperandsInOrder();
6526 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
6527 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
6529 case TreeEntry::NeedToGather:
6534 case Instruction::ZExt:
6535 case Instruction::SExt:
6536 case Instruction::FPToUI:
6537 case Instruction::FPToSI:
6538 case Instruction::FPExt:
6539 case Instruction::PtrToInt:
6540 case Instruction::IntToPtr:
6541 case Instruction::SIToFP:
6542 case Instruction::UIToFP:
6543 case Instruction::Trunc:
6544 case Instruction::FPTrunc:
6545 case Instruction::BitCast: {
6546 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6547 ReuseShuffleIndicies);
6550 TE->setOperandsInOrder();
6551 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6555 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
6561 case Instruction::ICmp:
6562 case Instruction::FCmp: {
6565 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6566 ReuseShuffleIndicies);
6574 "Commutative Predicate mismatch");
6575 reorderInputsAccordingToOpcode(VL,
Left,
Right, *TLI, *
DL, *SE, *
this);
6578 for (
Value *V : VL) {
6579 auto *
Cmp = cast<CmpInst>(V);
6582 if (
Cmp->getPredicate() != P0)
6594 case Instruction::Select:
6595 case Instruction::FNeg:
6596 case Instruction::Add:
6597 case Instruction::FAdd:
6598 case Instruction::Sub:
6599 case Instruction::FSub:
6600 case Instruction::Mul:
6601 case Instruction::FMul:
6602 case Instruction::UDiv:
6603 case Instruction::SDiv:
6604 case Instruction::FDiv:
6605 case Instruction::URem:
6606 case Instruction::SRem:
6607 case Instruction::FRem:
6608 case Instruction::Shl:
6609 case Instruction::LShr:
6610 case Instruction::AShr:
6611 case Instruction::And:
6612 case Instruction::Or:
6613 case Instruction::Xor: {
6614 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6615 ReuseShuffleIndicies);
6620 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
6622 reorderInputsAccordingToOpcode(VL,
Left,
Right, *TLI, *
DL, *SE, *
this);
6630 TE->setOperandsInOrder();
6631 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6635 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
6641 case Instruction::GetElementPtr: {
6642 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6643 ReuseShuffleIndicies);
6647 for (
Value *V : VL) {
6648 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
6653 Operands.front().push_back(
GEP->getPointerOperand());
6662 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
6664 [VL0Ty, IndexIdx](
Value *V) {
6665 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
6668 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
6671 :
DL->getIndexType(cast<GetElementPtrInst>(VL0)
6672 ->getPointerOperandType()
6675 for (
Value *V : VL) {
6676 auto *
I = dyn_cast<GetElementPtrInst>(V);
6679 ConstantInt::get(Ty, 0,
false));
6682 auto *
Op =
I->getOperand(IndexIdx);
6683 auto *CI = dyn_cast<ConstantInt>(
Op);
6688 CI, Ty, CI->getValue().isSignBitSet(), *
DL));
6692 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
6696 case Instruction::Store: {
6700 for (
Value *V : VL) {
6701 auto *
SI = cast<StoreInst>(V);
6702 *OIter =
SI->getValueOperand();
6706 if (CurrentOrder.empty()) {
6708 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6709 ReuseShuffleIndicies);
6710 TE->setOperandsInOrder();
6715 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6716 ReuseShuffleIndicies, CurrentOrder);
6717 TE->setOperandsInOrder();
6719 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
6723 case Instruction::Call: {
6726 CallInst *CI = cast<CallInst>(VL0);
6729 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6730 ReuseShuffleIndicies);
6731 TE->setOperandsInOrder();
6732 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
6739 for (
Value *V : VL) {
6740 auto *CI2 = cast<CallInst>(V);
6747 case Instruction::ShuffleVector: {
6748 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6749 ReuseShuffleIndicies);
6753 auto *CI = dyn_cast<CmpInst>(VL0);
6754 if (isa<BinaryOperator>(VL0) || CI) {
6757 return cast<CmpInst>(V)->isCommutative();
6759 reorderInputsAccordingToOpcode(VL,
Left,
Right, *TLI, *
DL, *SE,
6762 auto *MainCI = cast<CmpInst>(S.MainOp);
6763 auto *AltCI = cast<CmpInst>(S.AltOp);
6767 "Expected different main/alternate predicates.");
6770 for (
Value *V : VL) {
6771 auto *
Cmp = cast<CmpInst>(V);
6793 TE->setOperandsInOrder();
6794 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6798 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
6814 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
6815 if (
auto *ST = dyn_cast<StructType>(EltTy)) {
6817 for (
const auto *Ty : ST->elements())
6818 if (Ty != *ST->element_begin())
6820 N *= ST->getNumElements();
6821 EltTy = *ST->element_begin();
6822 }
else if (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
6823 N *= AT->getNumElements();
6824 EltTy = AT->getElementType();
6826 auto *VT = cast<FixedVectorType>(EltTy);
6827 N *= VT->getNumElements();
6828 EltTy = VT->getElementType();
6835 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
6836 VTSize !=
DL->getTypeStoreSizeInBits(
T))
6843 bool ResizeAllowed)
const {
6845 return isa<ExtractElementInst, ExtractValueInst>(V);
6847 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
6848 auto *E0 = cast<Instruction>(*It);
6851 return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
6857 Value *Vec = E0->getOperand(0);
6859 CurrentOrder.
clear();
6863 if (E0->getOpcode() == Instruction::ExtractValue) {
6868 LoadInst *LI = dyn_cast<LoadInst>(Vec);
6872 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
6875 unsigned E = VL.
size();
6876 if (!ResizeAllowed && NElts !=
E)
6879 unsigned MinIdx = NElts, MaxIdx = 0;
6881 auto *Inst = dyn_cast<Instruction>(V);
6884 if (Inst->getOperand(0) != Vec)
6886 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
6887 if (isa<UndefValue>(EE->getIndexOperand()))
6892 const unsigned ExtIdx = *
Idx;
6893 if (ExtIdx >= NElts)
6895 Indices[
I] = ExtIdx;
6896 if (MinIdx > ExtIdx)
6898 if (MaxIdx < ExtIdx)
6901 if (MaxIdx - MinIdx + 1 >
E)
6903 if (MaxIdx + 1 <=
E)
6907 bool ShouldKeepOrder =
true;
6914 for (
unsigned I = 0;
I <
E; ++
I) {
6917 const unsigned ExtIdx = Indices[
I] - MinIdx;
6918 if (CurrentOrder[ExtIdx] !=
E) {
6919 CurrentOrder.
clear();
6922 ShouldKeepOrder &= ExtIdx ==
I;
6923 CurrentOrder[ExtIdx] =
I;
6925 if (ShouldKeepOrder)
6926 CurrentOrder.
clear();
6928 return ShouldKeepOrder;
6931bool BoUpSLP::areAllUsersVectorized(
6933 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
6935 return ScalarToTreeEntry.contains(U) ||
6936 isVectorLikeInstWithConstOps(U) ||
6937 (isa<ExtractElementInst>(U) && MustGather.contains(U));
6941static std::pair<InstructionCost, InstructionCost>
6952 if (
auto *FPCI = dyn_cast<FPMathOperator>(CI))
6953 FMF = FPCI->getFastMathFlags();
6956 dyn_cast<IntrinsicInst>(CI));
6957 auto IntrinsicCost =
6964 auto LibCost = IntrinsicCost;
6971 return {IntrinsicCost, LibCost};
6974void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
6978 unsigned Sz = Scalars.size();
6981 if (!ReorderIndices.empty())
6983 for (
unsigned I = 0;
I < Sz; ++
I) {
6985 if (!ReorderIndices.empty())
6987 auto *OpInst = cast<Instruction>(Scalars[
Idx]);
6988 if (IsAltOp(OpInst)) {
6998 if (!ReuseShuffleIndices.empty()) {
7001 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7011 if (
auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7012 auto *AltCI = cast<CmpInst>(AltOp);
7015 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
7016 auto *CI = cast<CmpInst>(
I);
7024 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
7025 "CmpInst expected to match either main or alternate predicate or "
7028 return MainP !=
P && MainP != SwappedP;
7035 const auto *Op0 = Ops.
front();
7041 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
7045 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
7047 if (
auto *CI = dyn_cast<ConstantInt>(V))
7048 return CI->getValue().isPowerOf2();
7051 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
7053 if (
auto *CI = dyn_cast<ConstantInt>(V))
7054 return CI->getValue().isNegatedPowerOf2();
7059 if (IsConstant && IsUniform)
7061 else if (IsConstant)
7075class BaseShuffleAnalysis {
7082 int Limit =
Mask.size();
7094 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
7110 unsigned VF =
Mask.size();
7112 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
7115 int MaskedIdx =
Mask[ExtMask[
I] % VF];
7156 bool SinglePermute) {
7160 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
7162 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7168 if (isIdentityMask(Mask, SVTy,
false)) {
7169 if (!IdentityOp || !SinglePermute ||
7170 (isIdentityMask(Mask, SVTy,
true) &&
7172 IdentityMask.
size()))) {
7177 IdentityMask.
assign(Mask);
7197 if (SV->isZeroEltSplat()) {
7199 IdentityMask.
assign(Mask);
7201 int LocalVF =
Mask.size();
7203 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7204 LocalVF = SVOpTy->getNumElements();
7208 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
7210 ExtMask[
Idx] = SV->getMaskValue(
I);
7220 if (!IsOp1Undef && !IsOp2Undef) {
7222 for (
int &
I : Mask) {
7225 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
7232 SV->getShuffleMask().end());
7233 combineMasks(LocalVF, ShuffleMask, Mask);
7234 Mask.swap(ShuffleMask);
7236 Op = SV->getOperand(0);
7238 Op = SV->getOperand(1);
7240 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
7241 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7246 "Expected masks of same sizes.");
7251 Mask.swap(IdentityMask);
7252 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7253 return SinglePermute &&
7254 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
7256 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
7257 Shuffle->isZeroEltSplat() &&
7270 template <
typename T,
typename ShuffleBuilderTy>
7272 ShuffleBuilderTy &Builder) {
7273 assert(V1 &&
"Expected at least one vector value.");
7275 Builder.resizeToMatch(V1, V2);
7276 int VF =
Mask.size();
7277 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
7278 VF = FTy->getNumElements();
7285 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7288 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7290 CombinedMask1[
I] =
Mask[
I];
7292 CombinedMask2[
I] =
Mask[
I] - VF;
7299 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
7300 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
7303 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7304 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7309 ExtMask1[
Idx] = SV1->getMaskValue(
I);
7312 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7314 ExtMask1, UseMask::SecondArg);
7319 ExtMask2[
Idx] = SV2->getMaskValue(
I);
7322 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7324 ExtMask2, UseMask::SecondArg);
7325 if (SV1->getOperand(0)->getType() ==
7326 SV2->getOperand(0)->getType() &&
7327 SV1->getOperand(0)->getType() != SV1->getType() &&
7330 Op1 = SV1->getOperand(0);
7331 Op2 = SV2->getOperand(0);
7333 SV1->getShuffleMask().end());
7334 int LocalVF = ShuffleMask1.size();
7335 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
7336 LocalVF = FTy->getNumElements();
7337 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7338 CombinedMask1.swap(ShuffleMask1);
7340 SV2->getShuffleMask().end());
7341 LocalVF = ShuffleMask2.size();
7342 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
7343 LocalVF = FTy->getNumElements();
7344 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7345 CombinedMask2.swap(ShuffleMask2);
7348 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
7349 Builder.resizeToMatch(Op1, Op2);
7350 VF = std::max(cast<VectorType>(Op1->
getType())
7352 .getKnownMinValue(),
7353 cast<VectorType>(Op2->
getType())
7355 .getKnownMinValue());
7356 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7359 "Expected undefined mask element");
7360 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
7366 isa<ShuffleVectorInst>(Op1) &&
7367 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7369 return Builder.createIdentity(Op1);
7370 return Builder.createShuffleVector(
7374 if (isa<PoisonValue>(V1))
7375 return Builder.createPoison(
7376 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
7378 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
7379 assert(V1 &&
"Expected non-null value after looking through shuffles.");
7382 return Builder.createShuffleVector(V1, NewMask);
7383 return Builder.createIdentity(V1);
7399 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7402 Mask, NumSrcElts, NumSubElts,
Index)) {
7403 if (
Index + NumSubElts > NumSrcElts &&
7404 Index + NumSrcElts <=
static_cast<int>(Mask.size()))
7414static std::pair<InstructionCost, InstructionCost>
7425 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7435 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7439 for (
Value *V : Ptrs) {
7444 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
7449 if (!
Ptr || !
Ptr->hasOneUse())
7453 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
7459 TTI::PointersChainInfo::getKnownStride(),
7469 [](
const Value *V) {
7470 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
7471 return Ptr && !
Ptr->hasAllConstantIndices();
7473 ? TTI::PointersChainInfo::getUnknownStride()
7474 : TTI::PointersChainInfo::getKnownStride();
7478 if (
auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
7481 BaseGEP->getPointerOperand(), Indices, VecTy,
7486 return std::make_pair(ScalarCost, VecCost);
7496 bool IsFinalized =
false;
7509 bool SameNodesEstimated =
true;
7518 if (
auto *VTy = dyn_cast<VectorType>(Ty))
7534 const unsigned Sz = R.DL->getTypeSizeInBits(VL.
front()->getType());
7535 unsigned MinVF = R.getMinVF(2 * Sz);
7536 if (VL.
size() > 2 &&
7537 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7538 (InVectors.
empty() &&
7541 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7542 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7543 return S.getOpcode() == Instruction::Load &&
7546 !
all_of(Gathers, [&](
Value *V) {
return R.getTreeEntry(V); }) &&
7552 unsigned StartIdx = 0;
7553 unsigned VF = VL.
size() / 2;
7554 for (; VF >= MinVF; VF /= 2) {
7555 for (
unsigned Cnt = StartIdx,
End = VL.
size(); Cnt + VF <=
End;
7558 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7560 if (SliceS.getOpcode() != Instruction::Load ||
7561 SliceS.isAltShuffle())
7569 CurrentOrder, PointerOps);
7579 CurrentOrder.
empty()) ||
7588 if (Cnt == StartIdx)
7597 if (StartIdx >= VL.
size())
7600 if (!VectorizedLoads.
empty())
7603 if (!VectorizedLoads.
empty()) {
7605 bool NeedInsertSubvectorAnalysis =
7606 !NumParts || (VL.
size() / VF) > NumParts;
7611 GatherCost += getBuildVectorCost(VL.
slice(
I, VF), Root);
7618 for (
Value *V : VectorizedLoads) {
7619 auto *LI = cast<LoadInst>(V);
7626 for (
const std::pair<unsigned, LoadsState> &
P : VectorizedStarts) {
7627 auto *LI = cast<LoadInst>(VL[
P.first]);
7636 false, Alignment, CostKind, LI);
7640 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
7641 auto [ScalarGEPCost, VectorGEPCost] =
7643 Instruction::Load, CostKind, LI->
getType(), LoadTy);
7644 GatherCost += VectorGEPCost - ScalarGEPCost;
7646 for (
unsigned P : ScatterVectorized) {
7647 auto *LI0 = cast<LoadInst>(VL[
P]);
7649 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
7651 Instruction::Load, LoadTy, LI0->getPointerOperand(),
7652 false, CommonAlignment, CostKind, LI0);
7656 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
7664 auto [ScalarGEPCost, VectorGEPCost] =
7666 CostKind, ScalarTy, VecTy);
7667 GatherCost += VectorGEPCost - ScalarGEPCost;
7668 if (!Order.
empty()) {
7672 VecTy, Mask, CostKind);
7675 GatherCost += R.getGatherCost(PointerOps,
true);
7678 if (NeedInsertSubvectorAnalysis) {
7681 for (
unsigned I = VF,
E = VL.
size();
I <
E;
I += VF) {
7682 for (
unsigned Idx : seq<unsigned>(0,
E))
7685 ShuffleMask, CostKind,
I, LoadTy);
7688 GatherCost -= ScalarsCost;
7690 GatherCost = std::min(BaseCost, GatherCost);
7691 }
else if (!Root &&
isSplat(VL)) {
7695 find_if(VL, [](
Value *V) {
return !isa<UndefValue>(V); });
7696 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
7699 count(VL, *It) > 1 &&
7703 CostKind, std::distance(VL.
begin(), It),
7708 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
7711 Instruction::InsertElement, VecTy, CostKind, 0,
7715 ShuffleMask, CostKind, 0,
7721 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers)));
7728 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
7729 unsigned NumParts) {
7730 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
7732 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
7733 auto *EE = dyn_cast<ExtractElementInst>(V);
7736 auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType());
7737 return std::max(Sz, VecTy->getNumElements());
7741 if (NumSrcRegs == 0)
7746 auto CheckPerRegistersShuffle =
7751 int FirstRegId = -1;
7752 for (
int &
I : Mask) {
7755 int RegId = (
I / NumElts) * NumParts + (
I % NumElts) / EltsPerVector;
7758 RegIndices.
insert(RegId);
7759 if (RegIndices.
size() > 2)
7760 return std::nullopt;
7761 if (RegIndices.
size() == 2)
7763 I = (
I % NumElts) % EltsPerVector +
7764 (RegId == FirstRegId ? 0 : EltsPerVector);
7773 for (
unsigned Part = 0; Part < NumParts; ++Part) {
7774 if (!ShuffleKinds[Part])
7777 Mask.slice(Part * EltsPerVector,
7778 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
7779 ? Mask.size() % EltsPerVector
7783 std::optional<TTI::ShuffleKind> RegShuffleKind =
7784 CheckPerRegistersShuffle(SubMask);
7785 if (!RegShuffleKind) {
7787 TTI, *ShuffleKinds[Part],
7794 TTI, *RegShuffleKind,
7805 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
7812 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
7814 unsigned SliceSize) {
7815 if (SameNodesEstimated) {
7821 if ((InVectors.
size() == 2 &&
7822 InVectors.
front().get<
const TreeEntry *>() == &E1 &&
7823 InVectors.
back().get<
const TreeEntry *>() == E2) ||
7824 (!E2 && InVectors.
front().get<
const TreeEntry *>() == &E1)) {
7827 "Expected all poisoned elements.");
7830 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
7835 Cost += createShuffle(InVectors.
front(),
7836 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
7838 transformMaskAfterShuffle(CommonMask, CommonMask);
7840 SameNodesEstimated =
false;
7841 if (!E2 && InVectors.
size() == 1) {
7842 unsigned VF = E1.getVectorFactor();
7845 cast<FixedVectorType>(V1->
getType())->getNumElements());
7847 const auto *
E = InVectors.
front().get<
const TreeEntry *>();
7848 VF = std::max(VF,
E->getVectorFactor());
7850 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
7852 CommonMask[
Idx] = Mask[
Idx] + VF;
7853 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
7854 transformMaskAfterShuffle(CommonMask, CommonMask);
7856 Cost += createShuffle(&E1, E2, Mask);
7857 transformMaskAfterShuffle(CommonMask, Mask);
7861 class ShuffleCostBuilder {
7864 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
7866 return Mask.empty() ||
7867 (VF == Mask.size() &&
7875 ~ShuffleCostBuilder() =
default;
7880 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7881 if (isEmptyOrIdentity(Mask, VF))
7884 cast<VectorType>(V1->
getType()), Mask);
7889 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7890 if (isEmptyOrIdentity(Mask, VF))
7893 cast<VectorType>(V1->
getType()), Mask);
7899 void resizeToMatch(
Value *&,
Value *&)
const {}
7909 ShuffleCostBuilder Builder(
TTI);
7912 unsigned CommonVF = Mask.size();
7913 if (!V1 && !V2 && !P2.
isNull()) {
7915 const TreeEntry *
E = P1.
get<
const TreeEntry *>();
7916 unsigned VF =
E->getVectorFactor();
7917 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
7918 CommonVF = std::max(VF, E2->getVectorFactor());
7921 return Idx < 2 * static_cast<int>(CommonVF);
7923 "All elements in mask must be less than 2 * CommonVF.");
7924 if (
E->Scalars.size() == E2->Scalars.size()) {
7928 for (
int &
Idx : CommonMask) {
7931 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
7933 else if (
Idx >=
static_cast<int>(CommonVF))
7934 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
7938 CommonVF =
E->Scalars.size();
7942 V2 = getAllOnesValue(
7944 }
else if (!V1 && P2.
isNull()) {
7946 const TreeEntry *
E = P1.
get<
const TreeEntry *>();
7947 unsigned VF =
E->getVectorFactor();
7951 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
7952 "All elements in mask must be less than CommonVF.");
7953 if (
E->Scalars.size() == Mask.size() && VF != Mask.size()) {
7955 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
7956 for (
int &
Idx : CommonMask) {
7960 CommonVF =
E->Scalars.size();
7965 if (!
E->ReorderIndices.empty() && CommonVF ==
E->ReorderIndices.size() &&
7966 CommonVF == CommonMask.
size() &&
7968 [](
const auto &&
P) {
7970 static_cast<unsigned>(
P.value()) !=
P.index();
7978 }
else if (V1 && P2.
isNull()) {
7980 CommonVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
7983 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
7984 "All elements in mask must be less than CommonVF.");
7985 }
else if (V1 && !V2) {
7987 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
7988 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
7989 CommonVF = std::max(VF, E2->getVectorFactor());
7992 return Idx < 2 * static_cast<int>(CommonVF);
7994 "All elements in mask must be less than 2 * CommonVF.");
7995 if (E2->Scalars.size() == VF && VF != CommonVF) {
7997 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
7998 for (
int &
Idx : CommonMask) {
8001 if (
Idx >=
static_cast<int>(CommonVF))
8002 Idx = E2Mask[
Idx - CommonVF] + VF;
8008 V2 = getAllOnesValue(
8011 }
else if (!V1 && V2) {
8013 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8014 const TreeEntry *E1 = P1.
get<
const TreeEntry *>();
8015 CommonVF = std::max(VF, E1->getVectorFactor());
8018 return Idx < 2 * static_cast<int>(CommonVF);
8020 "All elements in mask must be less than 2 * CommonVF.");
8021 if (E1->Scalars.size() == VF && VF != CommonVF) {
8023 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
8024 for (
int &
Idx : CommonMask) {
8027 if (
Idx >=
static_cast<int>(CommonVF))
8028 Idx = E1Mask[
Idx - CommonVF] + VF;
8036 V2 = getAllOnesValue(
8040 assert(V1 && V2 &&
"Expected both vectors.");
8041 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8043 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8046 return Idx < 2 * static_cast<int>(CommonVF);
8048 "All elements in mask must be less than 2 * CommonVF.");
8049 if (V1->
getType() != V2->getType()) {
8051 cast<FixedVectorType>(V1->
getType())->getElementType(), CommonVF));
8052 V2 = getAllOnesValue(
8054 cast<FixedVectorType>(V1->
getType())->getElementType(),
8059 cast<FixedVectorType>(V1->
getType())->getElementType(),
8060 CommonMask.
size()));
8061 if (InVectors.
size() == 2)
8063 return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8064 V1, V2, CommonMask, Builder);
8071 :
TTI(
TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
8072 R(R), CheckedExtracts(CheckedExtracts) {}
8074 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8075 unsigned NumParts,
bool &UseVecBaseAsInput) {
8076 UseVecBaseAsInput =
false;
8079 Value *VecBase =
nullptr;
8082 if (NumParts == VL.
size())
8086 bool PrevNodeFound =
any_of(
8088 [&](
const std::unique_ptr<TreeEntry> &TE) {
8089 return ((!TE->isAltShuffle() &&
8090 TE->getOpcode() == Instruction::ExtractElement) ||
8091 TE->State == TreeEntry::NeedToGather) &&
8092 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8093 return VL.size() > Data.index() &&
8094 (Mask[Data.index()] == PoisonMaskElem ||
8095 isa<UndefValue>(VL[Data.index()]) ||
8096 Data.value() == VL[Data.index()]);
8100 unsigned SliceSize = VL.
size() / NumParts;
8101 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8102 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8103 for (
auto [
I, V] :
enumerate(VL.
slice(Part * SliceSize, SliceSize))) {
8105 if (isa<UndefValue>(V) ||
8114 auto *EE = cast<ExtractElementInst>(V);
8115 VecBase = EE->getVectorOperand();
8116 UniqueBases.
insert(VecBase);
8117 const TreeEntry *VE = R.getTreeEntry(V);
8118 if (!CheckedExtracts.
insert(V).second ||
8119 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8125 unsigned Idx = *EEIdx;
8127 if (EE->hasOneUse() || !PrevNodeFound) {
8129 if (isa<SExtInst, ZExtInst>(Ext) &&
all_of(Ext->users(), [](
User *U) {
8130 return isa<GetElementPtrInst>(U);
8136 EE->getVectorOperandType(),
Idx);
8139 Ext->getOpcode(), Ext->getType(), EE->getType(),
8155 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8158 transformMaskAfterShuffle(CommonMask, CommonMask);
8159 SameNodesEstimated =
false;
8160 if (NumParts != 1 && UniqueBases.
size() != 1) {
8161 UseVecBaseAsInput =
true;
8169 std::optional<InstructionCost>
8173 return std::nullopt;
8179 return Idx < static_cast<int>(E1.getVectorFactor());
8181 "Expected single vector shuffle mask.");
8185 if (InVectors.
empty()) {
8186 CommonMask.
assign(Mask.begin(), Mask.end());
8187 InVectors.
assign({&E1, &E2});
8190 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8194 if (NumParts == 0 || NumParts >= Mask.size())
8196 unsigned SliceSize = Mask.size() / NumParts;
8199 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8200 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8203 if (InVectors.
empty()) {
8204 CommonMask.
assign(Mask.begin(), Mask.end());
8205 InVectors.
assign(1, &E1);
8208 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8212 if (NumParts == 0 || NumParts >= Mask.size())
8214 unsigned SliceSize = Mask.size() / NumParts;
8217 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8218 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
8219 if (!SameNodesEstimated && InVectors.
size() == 1)
8232 cast<ExtractElementInst>(InVectors.
front()
8233 .get<
const TreeEntry *>()
8234 ->Scalars[
P.index()]);
8235 return EI->getVectorOperand() == V1 ||
8236 EI->getVectorOperand() == V2;
8238 "Expected extractelement vectors.");
8242 if (InVectors.
empty()) {
8244 "Expected empty input mask/vectors.");
8245 CommonMask.
assign(Mask.begin(), Mask.end());
8252 InVectors.
front().is<
const TreeEntry *>() && !CommonMask.
empty() &&
8256 .get<const TreeEntry *>()
8257 ->Scalars[
P.index()];
8259 return P.value() == Mask[
P.index()] ||
8260 isa<UndefValue>(Scalar);
8261 if (isa<Constant>(V1))
8263 auto *EI = cast<ExtractElementInst>(Scalar);
8264 return EI->getVectorOperand() == V1;
8266 "Expected only tree entry for extractelement vectors.");
8270 "Expected only tree entries from extracts/reused buildvectors.");
8271 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8272 if (InVectors.
size() == 2) {
8273 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
8274 transformMaskAfterShuffle(CommonMask, CommonMask);
8275 VF = std::max<unsigned>(VF, CommonMask.
size());
8276 }
else if (
const auto *InTE =
8277 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
8278 VF = std::max(VF, InTE->getVectorFactor());
8282 ->getNumElements());
8285 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8287 CommonMask[
Idx] = Mask[
Idx] + VF;
8290 Value *Root =
nullptr) {
8291 Cost += getBuildVectorCost(VL, Root);
8295 unsigned VF = VL.
size();
8297 VF = std::min(VF, MaskVF);
8299 if (isa<UndefValue>(V)) {
8309 cast<FixedVectorType>(Root->
getType())->getNumElements()),
8310 getAllOnesValue(*R.DL, VL.
front()->getType()));
8320 if (InVectors.
size() == 2)
8321 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
8323 Cost += createShuffle(Vec,
nullptr, CommonMask);
8324 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8328 "Expected vector length for the final value before action.");
8330 Action(V, CommonMask);
8331 InVectors.
front() = V;
8334 if (CommonMask.
empty()) {
8335 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
8339 createShuffle(InVectors.
front(),
8340 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
8346 "Shuffle construction must be finalized.");
8350const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *
E,
8351 unsigned Idx)
const {
8353 if (
const TreeEntry *TE = getTreeEntry(
Op)) {
8354 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8355 return EI.EdgeIdx == Idx && EI.UserTE == E;
8356 }) != TE->UserTreeIndices.end())
8358 auto MIt = MultiNodeScalars.
find(
Op);
8359 if (MIt != MultiNodeScalars.
end()) {
8360 for (
const TreeEntry *TE : MIt->second) {
8361 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8362 return EI.EdgeIdx == Idx && EI.UserTE == E;
8363 }) != TE->UserTreeIndices.end())
8369 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8370 return TE->State == TreeEntry::NeedToGather &&
8371 find_if(
TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8372 return EI.EdgeIdx == Idx && EI.UserTE == E;
8373 }) !=
TE->UserTreeIndices.end();
8375 assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
8384 Type *ScalarTy = VL[0]->getType();
8385 if (
E->State != TreeEntry::NeedToGather) {
8386 if (
auto *SI = dyn_cast<StoreInst>(VL[0]))
8387 ScalarTy =
SI->getValueOperand()->getType();
8388 else if (
auto *CI = dyn_cast<CmpInst>(VL[0]))
8390 else if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8391 ScalarTy =
IE->getOperand(1)->getType();
8400 auto It = MinBWs.
find(
E);
8401 if (It != MinBWs.
end()) {
8405 unsigned EntryVF =
E->getVectorFactor();
8408 bool NeedToShuffleReuses = !
E->ReuseShuffleIndices.empty();
8409 if (
E->State == TreeEntry::NeedToGather) {
8412 if (isa<InsertElementInst>(VL[0]))
8414 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8415 E, *
TTI, VectorizedVals, *
this, CheckedExtracts);
8420 if (!
E->ReorderIndices.empty() &&
8421 (
E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8423 if (
E->getOpcode() == Instruction::Store) {
8425 NewMask.
resize(
E->ReorderIndices.size());
8432 if (NeedToShuffleReuses)
8437 assert((
E->State == TreeEntry::Vectorize ||
8438 E->State == TreeEntry::ScatterVectorize ||
8439 E->State == TreeEntry::StridedVectorize) &&
8443 (
E->getOpcode() == Instruction::GetElementPtr &&
8444 E->getMainOp()->getType()->isPointerTy())) &&
8447 unsigned ShuffleOrOp =
8448 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector :
E->getOpcode();
8450 const unsigned Sz = UniqueValues.
size();
8452 for (
unsigned I = 0;
I < Sz; ++
I) {
8453 if (getTreeEntry(UniqueValues[
I]) ==
E)
8457 auto GetCastContextHint = [&](
Value *
V) {
8458 if (
const TreeEntry *OpTE = getTreeEntry(V)) {
8459 if (OpTE->State == TreeEntry::ScatterVectorize ||
8460 OpTE->State == TreeEntry::StridedVectorize)
8462 if (OpTE->State == TreeEntry::Vectorize &&
8463 OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
8464 if (OpTE->ReorderIndices.empty())
8472 InstructionsState SrcState =
getSameOpcode(
E->getOperand(0), *TLI);
8473 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8483 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8487 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8489 for (
unsigned I = 0;
I < Sz; ++
I) {
8490 if (UsedScalars.test(
I))
8492 ScalarCost += ScalarEltCost(
I);
8500 const EdgeInfo &EI =
E->UserTreeIndices.front();
8501 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8503 It != MinBWs.
end()) {
8504 auto UserBWIt = MinBWs.
find(EI.UserTE);
8505 Type *UserScalarTy =
8506 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8507 if (UserBWIt != MinBWs.
end())
8509 UserBWIt->second.first);
8510 if (ScalarTy != UserScalarTy) {
8511 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
8512 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
8517 VecOpcode = Instruction::Trunc;
8520 It->second.second ? Instruction::SExt : Instruction::ZExt;
8529 LLVM_DEBUG(dumpTreeCosts(
E, CommonCost, VecCost - CommonCost,
8530 ScalarCost,
"Calculated costs for Tree"));
8531 return VecCost - ScalarCost;
8536 assert((
E->State == TreeEntry::Vectorize ||
8537 E->State == TreeEntry::StridedVectorize) &&
8538 "Entry state expected to be Vectorize or StridedVectorize here.");
8542 *
TTI, Ptrs, BasePtr,
E->getOpcode(),
CostKind, ScalarTy, VecTy);
8543 LLVM_DEBUG(dumpTreeCosts(
E, 0, VecCost, ScalarCost,
8544 "Calculated GEPs cost for Tree"));
8546 return VecCost - ScalarCost;
8549 switch (ShuffleOrOp) {
8550 case Instruction::PHI: {
8554 for (
Value *V : UniqueValues) {
8555 auto *
PHI = dyn_cast<PHINode>(V);
8560 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
8564 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
8566 if (!OpTE->ReuseShuffleIndices.empty())
8567 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8568 OpTE->Scalars.size());
8571 return CommonCost - ScalarCost;
8573 case Instruction::ExtractValue:
8574 case Instruction::ExtractElement: {
8575 auto GetScalarCost = [&](
unsigned Idx) {
8576 auto *
I = cast<Instruction>(UniqueValues[
Idx]);
8578 if (ShuffleOrOp == Instruction::ExtractElement) {
8579 auto *EE = cast<ExtractElementInst>(
I);
8580 SrcVecTy = EE->getVectorOperandType();
8582 auto *EV = cast<ExtractValueInst>(
I);
8583 Type *AggregateTy = EV->getAggregateOperand()->getType();
8585 if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
8586 NumElts = ATy->getNumElements();
8591 if (
I->hasOneUse()) {
8593 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
8595 [](
User *U) { return isa<GetElementPtrInst>(U); })) {
8602 Ext->getOpcode(),
Ext->getType(),
I->getType(),
8610 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
8611 return GetCostDiff(GetScalarCost, GetVectorCost);
8613 case Instruction::InsertElement: {
8614 assert(
E->ReuseShuffleIndices.empty() &&
8615 "Unique insertelements only are expected.");
8616 auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
8617 unsigned const NumElts = SrcVecTy->getNumElements();
8618 unsigned const NumScalars = VL.
size();
8624 unsigned OffsetEnd = OffsetBeg;
8625 InsertMask[OffsetBeg] = 0;
8628 if (OffsetBeg >
Idx)
8630 else if (OffsetEnd <
Idx)
8632 InsertMask[
Idx] =
I + 1;
8636 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
8637 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
8639 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
8640 unsigned InsertVecSz = std::min<unsigned>(
8642 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
8643 bool IsWholeSubvector =
8644 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
8648 if (OffsetBeg + InsertVecSz > VecSz) {
8651 InsertVecSz = VecSz;
8657 if (!
E->ReorderIndices.empty()) {
8662 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
8664 bool IsIdentity =
true;
8666 Mask.swap(PrevMask);
8667 for (
unsigned I = 0;
I < NumScalars; ++
I) {
8669 DemandedElts.
setBit(InsertIdx);
8670 IsIdentity &= InsertIdx - OffsetBeg ==
I;
8671 Mask[InsertIdx - OffsetBeg] =
I;
8673 assert(
Offset < NumElts &&
"Failed to find vector index offset");
8688 auto *FirstInsert = cast<Instruction>(*
find_if(
E->Scalars, [
E](
Value *V) {
8689 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
8697 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
8698 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
8699 if (InsertVecSz != VecSz) {
8711 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
8720 case Instruction::ZExt:
8721 case Instruction::SExt:
8722 case Instruction::FPToUI:
8723 case Instruction::FPToSI:
8724 case Instruction::FPExt:
8725 case Instruction::PtrToInt:
8726 case Instruction::IntToPtr:
8727 case Instruction::SIToFP:
8728 case Instruction::UIToFP:
8729 case Instruction::Trunc:
8730 case Instruction::FPTrunc:
8731 case Instruction::BitCast: {
8732 auto SrcIt = MinBWs.
find(getOperandEntry(
E, 0));
8735 unsigned Opcode = ShuffleOrOp;
8736 unsigned VecOpcode = Opcode;
8738 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
8740 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
8741 if (SrcIt != MinBWs.
end()) {
8742 SrcBWSz = SrcIt->second.first;
8746 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
8747 if (BWSz == SrcBWSz) {
8748 VecOpcode = Instruction::BitCast;
8749 }
else if (BWSz < SrcBWSz) {
8750 VecOpcode = Instruction::Trunc;
8751 }
else if (It != MinBWs.
end()) {
8752 assert(BWSz > SrcBWSz &&
"Invalid cast!");
8753 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
8759 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
8762 ? cast<Instruction>(UniqueValues[
Idx])
8771 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
8773 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
8777 VecOpcode == Opcode ? VI :
nullptr);
8779 return GetCostDiff(GetScalarCost, GetVectorCost);
8781 case Instruction::FCmp:
8782 case Instruction::ICmp:
8783 case Instruction::Select: {
8787 match(VL0, MatchCmp))
8793 auto GetScalarCost = [&](
unsigned Idx) {
8794 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
8800 !
match(VI, MatchCmp)) ||
8801 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
8807 Builder.getInt1Ty(), CurrentPred,
CostKind,
8814 E->getOpcode(), VecTy, MaskTy, VecPred,
CostKind, VL0);
8826 if (IntrinsicAndUse.second)
8829 VecCost = std::min(VecCost, IntrinsicCost);
8831 return VecCost + CommonCost;
8833 return GetCostDiff(GetScalarCost, GetVectorCost);
8835 case Instruction::FNeg:
8836 case Instruction::Add:
8837 case Instruction::FAdd:
8838 case Instruction::Sub:
8839 case Instruction::FSub:
8840 case Instruction::Mul:
8841 case Instruction::FMul:
8842 case Instruction::UDiv:
8843 case Instruction::SDiv:
8844 case Instruction::FDiv:
8845 case Instruction::URem:
8846 case Instruction::SRem:
8847 case Instruction::FRem:
8848 case Instruction::Shl:
8849 case Instruction::LShr:
8850 case Instruction::AShr:
8851 case Instruction::And:
8852 case Instruction::Or:
8853 case Instruction::Xor: {
8854 auto GetScalarCost = [&](
unsigned Idx) {
8855 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
8856 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
8865 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
8869 Op2Info, std::nullopt,
nullptr, TLI) +
8872 return GetCostDiff(GetScalarCost, GetVectorCost);
8874 case Instruction::GetElementPtr: {
8875 return CommonCost + GetGEPCostDiff(VL, VL0);
8877 case Instruction::Load: {
8878 auto GetScalarCost = [&](
unsigned Idx) {
8879 auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
8884 auto *LI0 = cast<LoadInst>(VL0);
8887 if (
E->State == TreeEntry::Vectorize) {
8889 Instruction::Load, VecTy, LI0->getAlign(),
8891 }
else if (
E->State == TreeEntry::StridedVectorize) {
8892 Align CommonAlignment =
8893 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
8895 Instruction::Load, VecTy, LI0->getPointerOperand(),
8898 assert(
E->State == TreeEntry::ScatterVectorize &&
"Unknown EntryState");
8899 Align CommonAlignment =
8900 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
8902 Instruction::Load, VecTy, LI0->getPointerOperand(),
8905 return VecLdCost + CommonCost;
8911 if (
E->State == TreeEntry::ScatterVectorize)
8917 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8918 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
8920 case Instruction::Store: {
8921 bool IsReorder = !
E->ReorderIndices.empty();
8922 auto GetScalarCost = [=](
unsigned Idx) {
8923 auto *
VI = cast<StoreInst>(VL[
Idx]);
8930 cast<StoreInst>(IsReorder ? VL[
E->ReorderIndices.front()] : VL0);
8935 BaseSI->getPointerAddressSpace(),
CostKind,
8941 unsigned Idx = IsReorder ?
E->ReorderIndices[
I] :
I;
8942 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
8945 return GetCostDiff(GetScalarCost, GetVectorCost) +
8946 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
8948 case Instruction::Call: {
8949 auto GetScalarCost = [&](
unsigned Idx) {
8950 auto *CI = cast<CallInst>(UniqueValues[
Idx]);
8961 auto *CI = cast<CallInst>(VL0);
8963 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
8965 return GetCostDiff(GetScalarCost, GetVectorCost);
8967 case Instruction::ShuffleVector: {
8973 (isa<CmpInst>(VL0) && isa<CmpInst>(
E->getAltOp()))) &&
8974 "Invalid Shuffle Vector Operand");
8977 auto TryFindNodeWithEqualOperands = [=]() {
8978 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8981 if (
TE->isAltShuffle() &&
8982 ((
TE->getOpcode() ==
E->getOpcode() &&
8983 TE->getAltOpcode() ==
E->getAltOpcode()) ||
8984 (
TE->getOpcode() ==
E->getAltOpcode() &&
8985 TE->getAltOpcode() ==
E->getOpcode())) &&
8986 TE->hasEqualOperands(*
E))
8991 auto GetScalarCost = [&](
unsigned Idx) {
8992 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
8993 assert(
E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
9005 if (TryFindNodeWithEqualOperands()) {
9007 dbgs() <<
"SLP: diamond match for alternate node found.\n";
9017 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9020 CI0->getPredicate(),
CostKind, VL0);
9022 E->getOpcode(), VecTy, MaskTy,
9023 cast<CmpInst>(
E->getAltOp())->getPredicate(),
CostKind,
9026 Type *Src0SclTy =
E->getMainOp()->getOperand(0)->getType();
9027 Type *Src1SclTy =
E->getAltOp()->getOperand(0)->getType();
9037 E->buildAltOpShuffleMask(
9039 assert(
E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
9040 return I->getOpcode() ==
E->getAltOpcode();
9049 unsigned Opcode0 =
E->getOpcode();
9050 unsigned Opcode1 =
E->getAltOpcode();
9053 for (
unsigned Lane : seq<unsigned>(0,
E->Scalars.size()))
9054 if (cast<Instruction>(
E->Scalars[Lane])->getOpcode() == Opcode1)
9055 OpcodeMask.set(Lane);
9060 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
9061 return AltVecCost < VecCost ? AltVecCost : VecCost;
9066 return GetCostDiff(GetScalarCost, GetVectorCost);
9073bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
9075 << VectorizableTree.size() <<
" is fully vectorizable .\n");
9077 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
9079 return TE->State == TreeEntry::NeedToGather &&
9081 [
this](
Value *V) { return EphValues.contains(V); }) &&
9083 TE->Scalars.size() < Limit ||
9084 ((
TE->getOpcode() == Instruction::ExtractElement ||
9087 return isa<ExtractElementInst, UndefValue>(V);
9090 (
TE->State == TreeEntry::NeedToGather &&
9091 TE->getOpcode() == Instruction::Load && !
TE->isAltShuffle()));
9095 if (VectorizableTree.size() == 1 &&
9096 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9098 AreVectorizableGathers(VectorizableTree[0].
get(),
9099 VectorizableTree[0]->Scalars.size()) &&
9100 VectorizableTree[0]->getVectorFactor() > 2)))
9103 if (VectorizableTree.size() != 2)
9111 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9112 AreVectorizableGathers(VectorizableTree[1].
get(),
9113 VectorizableTree[0]->Scalars.size()))
9117 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9118 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9119 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9120 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9128 bool MustMatchOrInst) {
9132 Value *ZextLoad = Root;
9133 const APInt *ShAmtC;
9134 bool FoundOr =
false;
9135 while (!isa<ConstantExpr>(ZextLoad) &&
9138 ShAmtC->
urem(8) == 0))) {
9139 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9140 ZextLoad = BinOp->getOperand(0);
9141 if (BinOp->getOpcode() == Instruction::Or)
9146 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9153 Type *SrcTy = Load->getType();
9160 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
9161 << *(cast<Instruction>(Root)) <<
"\n");
9170 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9171 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9179 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9180 for (
Value *Scalar : VectorizableTree[0]->Scalars) {
9191 if (VectorizableTree.size() == 2 &&
9192 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9193 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9194 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9195 !(
isSplat(VectorizableTree[1]->Scalars) ||
9203 constexpr int Limit = 4;
9205 !VectorizableTree.empty() &&
9206 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9207 return (TE->State == TreeEntry::NeedToGather &&
9208 TE->getOpcode() != Instruction::ExtractElement &&
9210 [](
Value *V) { return isa<ExtractElementInst>(V); }) <=
9212 TE->getOpcode() == Instruction::PHI;
9223 if (isFullyVectorizableTinyTree(ForReduction))
9228 if (
any_of(VectorizableTree, [](
const std::unique_ptr<TreeEntry> &TE) {
9229 return TE->State == TreeEntry::NeedToGather &&
9231 return isa<ExtractElementInst, UndefValue>(V) ||
9232 (!V->hasNUsesOrMore(UsesLimit) &&
9233 any_of(V->users(), [](User *U) {
9234 return isa<InsertElementInst>(U);
9240 assert(VectorizableTree.empty()
9241 ? ExternalUses.empty()
9242 :
true &&
"We shouldn't have any external users");
9254 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9267 for (
const auto &TEPtr : VectorizableTree) {
9268 if (TEPtr->State != TreeEntry::Vectorize)
9270 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9276 auto *NodeA = DT->
getNode(
A->getParent());
9277 auto *NodeB = DT->
getNode(
B->getParent());
9278 assert(NodeA &&
"Should only process reachable instructions");
9279 assert(NodeB &&
"Should only process reachable instructions");
9280 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9281 "Different nodes should have different DFS numbers");
9283 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9284 return B->comesBefore(
A);
9294 LiveValues.
erase(PrevInst);
9295 for (
auto &J : PrevInst->
operands()) {
9296 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9297 LiveValues.
insert(cast<Instruction>(&*J));
9301 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
9302 for (
auto *
X : LiveValues)
9303 dbgs() <<
" " <<
X->getName();
9304 dbgs() <<
", Looking at ";
9309 unsigned NumCalls = 0;
9313 while (InstIt != PrevInstIt) {
9315 PrevInstIt = Inst->getParent()->rbegin();
9320 if (
auto *II = dyn_cast<IntrinsicInst>(
I)) {
9321 if (II->isAssumeLikeIntrinsic())
9325 for (
auto &ArgOp : II->args())
9327 if (
auto *FPMO = dyn_cast<FPMathOperator>(II))
9328 FMF = FPMO->getFastMathFlags();
9335 if (IntrCost < CallCost)
9342 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9343 &*PrevInstIt != PrevInst)
9351 for (
auto *II : LiveValues) {
9352 auto *ScalarTy = II->getType();
9353 if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9354 ScalarTy = VectorTy->getElementType();
9372 const auto *I1 = IE1;
9373 const auto *I2 = IE2;
9385 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9387 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9388 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
9390 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9391 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9399 template <
typename U>
9400 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
9403 template <
typename U>
9404 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
9422template <
typename T>
9428 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
9430 auto VMIt = std::next(ShuffleMask.begin());
9433 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9435 if (!IsBaseUndef.
all()) {
9437 std::pair<T *, bool> Res =
9438 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
9440 for (
unsigned Idx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
9444 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
9446 auto *V = ValueSelect::get<T *>(
Base);
9448 assert((!V || GetVF(V) == Mask.size()) &&
9449 "Expected base vector of VF number of elements.");
9450 Prev = Action(Mask, {
nullptr, Res.first});
9451 }
else if (ShuffleMask.size() == 1) {
9454 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9460 Prev = Action(Mask, {ShuffleMask.begin()->first});
9464 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9465 unsigned Vec2VF = GetVF(VMIt->first);
9466 if (Vec1VF == Vec2VF) {
9470 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9473 Mask[
I] = SecMask[
I] + Vec1VF;
9476 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9479 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9481 std::pair<T *, bool> Res2 =
9482 ResizeAction(VMIt->first, VMIt->second,
false);
9484 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9491 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
9494 Prev = Action(Mask, {Res1.first, Res2.first});
9496 VMIt = std::next(VMIt);
9498 bool IsBaseNotUndef = !IsBaseUndef.
all();
9499 (void)IsBaseNotUndef;
9501 for (
auto E = ShuffleMask.end(); VMIt !=
E; ++VMIt) {
9503 std::pair<T *, bool> Res =
9504 ResizeAction(VMIt->first, VMIt->second,
false);
9506 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9509 "Multiple uses of scalars.");
9510 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
9515 Prev = Action(Mask, {Prev, Res.first});
9523 << VectorizableTree.size() <<
".\n");
9525 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9528 for (
unsigned I = 0,
E = VectorizableTree.size();
I <
E; ++
I) {
9529 TreeEntry &TE = *VectorizableTree[
I];
9530 if (TE.State == TreeEntry::NeedToGather) {
9531 if (
const TreeEntry *
E = getTreeEntry(TE.getMainOp());
9532 E &&
E->getVectorFactor() == TE.getVectorFactor() &&
9533 E->isSame(TE.Scalars)) {
9538 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
9547 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
9557 for (ExternalUser &EU : ExternalUses) {
9559 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
9560 !ExtractCostCalculated.
insert(EU.Scalar).second)
9566 if (EphValues.
count(EU.User))
9570 if (isa<FixedVectorType>(EU.Scalar->getType()))
9575 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
9576 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
9577 if (!UsedInserts.
insert(VU).second)
9581 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
9584 [
this, VU](
const std::pair<Value *, const TreeEntry *> &Pair) {
9586 VU, cast<InsertElementInst>(Pair.first),
9588 Value *Op0 = II->getOperand(0);
9589 if (getTreeEntry(II) && !getTreeEntry(Op0))
9595 if (It == FirstUsers.
end()) {
9602 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
9603 if (IEBase != EU.User &&
9604 (!IEBase->hasOneUse() ||
9608 if (
const TreeEntry *
E = getTreeEntry(IEBase)) {
9611 IEBase = cast<InsertElementInst>(
Base);
9614 "InsertElementInstruction used already.");
9616 Base = IEBase->getOperand(0);
9617 }
while (
E == getTreeEntry(
Base));
9620 Base = cast<InsertElementInst>(
Base)->getOperand(0);
9624 VecId = FirstUsers.
size() - 1;
9625 auto It = MinBWs.
find(ScalarTE);
9626 if (It != MinBWs.
end() &&
9628 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
9630 unsigned BWSz = It->second.first;
9631 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
9634 VecOpcode = Instruction::Trunc;
9637 It->second.second ? Instruction::SExt : Instruction::ZExt;
9643 FTy->getNumElements()),
9646 <<
" for extending externally used vector with "
9647 "non-equal minimum bitwidth.\n");
9653 VecId = std::distance(FirstUsers.
begin(), It);
9655 int InIdx = *InsertIdx;
9659 Mask[InIdx] = EU.Lane;
9660 DemandedElts[VecId].setBit(InIdx);
9671 auto It = MinBWs.
find(getTreeEntry(EU.Scalar));
9672 if (It != MinBWs.
end()) {
9675 It->second.second ? Instruction::SExt : Instruction::ZExt;
9685 if (!VectorizedVals.
empty()) {
9686 const TreeEntry &Root = *VectorizableTree.front().get();
9687 auto BWIt = MinBWs.
find(&Root);
9688 if (BWIt != MinBWs.
end()) {
9689 Type *DstTy = Root.Scalars.front()->getType();
9690 unsigned OriginalSz =
DL->getTypeSizeInBits(DstTy);
9691 if (OriginalSz != BWIt->second.first) {
9692 unsigned Opcode = Instruction::Trunc;
9693 if (OriginalSz < BWIt->second.first)
9694 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
9704 Cost += SpillCost + ExtractCost;
9708 unsigned VF = Mask.size();
9709 unsigned VecVF = TE->getVectorFactor();
9711 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
9714 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
9720 dbgs() <<
"SLP: Adding cost " <<
C
9721 <<
" for final shuffle of insertelement external users.\n";
9722 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
9724 return std::make_pair(TE,
true);
9726 return std::make_pair(TE,
false);
9729 for (
int I = 0,
E = FirstUsers.
size();
I <
E; ++
I) {
9730 Value *
Base = cast<Instruction>(FirstUsers[
I].first)->getOperand(0);
9731 auto Vector = ShuffleMasks[
I].takeVector();
9735 assert((TEs.size() == 1 || TEs.size() == 2) &&
9736 "Expected exactly 1 or 2 tree entries.");
9737 if (TEs.size() == 1) {
9739 VF = TEs.front()->getVectorFactor();
9745 (
Data.index() < VF &&
9746 static_cast<int>(
Data.index()) ==
Data.value());
9751 <<
" for final shuffle of insertelement "
9752 "external users.\n";
9753 TEs.front()->
dump();
9754 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
9760 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
9761 VF = TEs.front()->getVectorFactor();
9770 <<
" for final shuffle of vector node and external "
9771 "insertelement users.\n";
9772 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
9773 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
9779 (void)performExtractsShuffleAction<const TreeEntry>(
9781 [](
const TreeEntry *
E) {
return E->getVectorFactor(); }, ResizeToVF,
9782 EstimateShufflesCost);
9784 cast<FixedVectorType>(FirstUsers[
I].first->getType()), DemandedElts[
I],
9793 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
9794 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
9795 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
9799 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
9810std::optional<TTI::ShuffleKind>
9811BoUpSLP::tryToGatherSingleRegisterExtractElements(
9817 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
9818 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
9820 if (isa<UndefValue>(VL[
I]))
9824 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
9825 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
9834 ExtractMask.reset(*
Idx);
9839 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
9843 for (
const auto &
Data : VectorOpToIdx)
9844 VFToVector[cast<FixedVectorType>(
Data.first->getType())->getNumElements()]
9845 .push_back(
Data.first);
9846 for (
auto &
Data : VFToVector) {
9848 return VectorOpToIdx.find(V1)->second.size() >
9849 VectorOpToIdx.find(V2)->second.size();
9854 const int UndefSz = UndefVectorExtracts.
size();
9855 unsigned SingleMax = 0;
9856 Value *SingleVec =
nullptr;
9857 unsigned PairMax = 0;
9858 std::pair<Value *, Value *> PairVec(
nullptr,
nullptr);
9859 for (
auto &
Data : VFToVector) {
9861 if (SingleMax < VectorOpToIdx[V1].
size() + UndefSz) {
9862 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
9866 if (
Data.second.size() > 1)
9867 V2 = *std::next(
Data.second.begin());
9868 if (V2 && PairMax < VectorOpToIdx[V1].
size() + VectorOpToIdx[V2].
size() +
9870 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[
V2].size() + UndefSz;
9871 PairVec = std::make_pair(V1, V2);
9874 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
9875 return std::nullopt;
9881 if (SingleMax >= PairMax && SingleMax) {
9882 for (
int Idx : VectorOpToIdx[SingleVec])
9885 for (
Value *V : {PairVec.first, PairVec.second})
9886 for (
int Idx : VectorOpToIdx[V])
9890 for (
int Idx : UndefVectorExtracts)
9894 std::optional<TTI::ShuffleKind> Res =
9900 return std::nullopt;
9904 for (
int I = 0,
E = GatheredExtracts.size();
I <
E; ++
I) {
9906 isa<UndefValue>(GatheredExtracts[
I])) {
9910 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
9911 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
9912 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
9927 unsigned NumParts)
const {
9928 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
9931 unsigned SliceSize = VL.
size() / NumParts;
9932 for (
unsigned Part = 0; Part < NumParts; ++Part) {
9938 std::optional<TTI::ShuffleKind> Res =
9939 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
9940 ShufflesRes[Part] = Res;
9941 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
9943 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
9944 return Res.has_value();
9946 ShufflesRes.clear();
9950std::optional<TargetTransformInfo::ShuffleKind>
9951BoUpSLP::isGatherShuffledSingleRegisterEntry(
9957 const EdgeInfo &TEUseEI =
TE->UserTreeIndices.front();
9958 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
9962 if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
9963 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
9966 TEInsertBlock = TEInsertPt->
getParent();
9969 return std::nullopt;
9970 auto *NodeUI = DT->
getNode(TEInsertBlock);
9971 assert(NodeUI &&
"Should only process reachable instructions");
9973 auto CheckOrdering = [&](
const Instruction *InsertPt) {
9987 auto *NodeEUI = DT->
getNode(InsertBlock);
9990 assert((NodeUI == NodeEUI) ==
9991 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
9992 "Different nodes should have different DFS numbers");
9994 if (TEInsertPt->
getParent() != InsertBlock &&
9997 if (TEInsertPt->
getParent() == InsertBlock &&
10011 for (
Value *V : VL) {
10016 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10020 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
10021 "Must contain at least single gathered value.");
10022 assert(TEPtr->UserTreeIndices.size() == 1 &&
10023 "Expected only single user of a gather node.");
10024 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10026 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10029 : &getLastInstructionInBundle(UseEI.UserTE);
10030 if (TEInsertPt == InsertPt) {
10034 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10038 if (TEUseEI.UserTE != UseEI.UserTE &&
10039 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10045 if ((TEInsertBlock != InsertPt->
getParent() ||
10046 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10047 !CheckOrdering(InsertPt))
10051 if (
const TreeEntry *VTE = getTreeEntry(V)) {
10053 if (VTE->State != TreeEntry::Vectorize) {
10054 auto It = MultiNodeScalars.
find(V);
10055 if (It == MultiNodeScalars.
end())
10057 VTE = *It->getSecond().begin();
10059 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
10060 return MTE->State == TreeEntry::Vectorize;
10062 if (MIt == It->getSecond().end())
10067 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10068 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10070 auto It = MinBWs.
find(VTE);
10072 if (It != MinBWs.
end() &&
10073 It->second.first !=
DL->getTypeSizeInBits(
V->getType()))
10077 if (VToTEs.
empty())
10079 if (UsedTEs.
empty()) {
10093 if (!VToTEs.
empty()) {
10099 VToTEs = SavedVToTEs;
10108 if (UsedTEs.
size() == 2)
10110 UsedTEs.push_back(SavedVToTEs);
10117 if (UsedTEs.
empty()) {
10119 return std::nullopt;
10123 if (UsedTEs.
size() == 1) {
10126 UsedTEs.front().
end());
10127 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10128 return TE1->Idx < TE2->Idx;
10131 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
10132 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
10134 if (It != FirstEntries.end() &&
10135 ((*It)->getVectorFactor() == VL.size() ||
10136 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
10137 TE->ReuseShuffleIndices.size() == VL.size() &&
10138 (*It)->isSame(
TE->Scalars)))) {
10139 Entries.push_back(*It);
10140 if ((*It)->getVectorFactor() == VL.size()) {
10141 std::iota(std::next(
Mask.begin(), Part * VL.size()),
10142 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
10148 for (
int I = 0, Sz = VL.size();
I < Sz; ++
I)
10149 if (isa<PoisonValue>(VL[
I]))
10155 Entries.push_back(FirstEntries.front());
10158 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
10161 for (
const TreeEntry *TE : UsedTEs.front()) {
10162 unsigned VF =
TE->getVectorFactor();
10163 auto It = VFToTE.
find(VF);
10164 if (It != VFToTE.
end()) {
10165 if (It->second->Idx >
TE->Idx)
10166 It->getSecond() =
TE;
10173 UsedTEs.back().
end());
10174 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10175 return TE1->Idx < TE2->Idx;
10177 for (
const TreeEntry *TE : SecondEntries) {
10178 auto It = VFToTE.
find(
TE->getVectorFactor());
10179 if (It != VFToTE.
end()) {
10181 Entries.push_back(It->second);
10182 Entries.push_back(TE);
10188 if (Entries.empty()) {
10190 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10191 return TE1->Idx < TE2->Idx;
10193 Entries.push_back(SecondEntries.front());
10194 VF = std::max(Entries.front()->getVectorFactor(),
10195 Entries.back()->getVectorFactor());
10202 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
10203 auto *
PHI = cast<PHINode>(V);
10204 auto *PHI1 = cast<PHINode>(V1);
10209 for (
int I = 0,
E =
PHI->getNumIncomingValues();
I <
E; ++
I) {
10211 Value *In1 = PHI1->getIncomingValue(
I);
10216 if (cast<Instruction>(In)->
getParent() !=
10226 auto MightBeIgnored = [=](
Value *
V) {
10227 auto *
I = dyn_cast<Instruction>(V);
10228 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
10230 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
10235 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
10237 bool UsedInSameVTE =
false;
10238 auto It = UsedValuesEntry.
find(V1);
10239 if (It != UsedValuesEntry.
end())
10240 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
10241 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10243 cast<Instruction>(V)->getParent() ==
10244 cast<Instruction>(V1)->getParent() &&
10245 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10250 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
10252 auto It = UsedValuesEntry.
find(V);
10253 if (It == UsedValuesEntry.
end())
10259 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
10260 (
I !=
E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
10262 unsigned Idx = It->second;
10269 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
10270 if (!UsedIdxs.test(
I))
10276 for (std::pair<unsigned, int> &Pair : EntryLanes)
10277 if (Pair.first ==
I)
10278 Pair.first = TempEntries.
size();
10281 Entries.swap(TempEntries);
10282 if (EntryLanes.size() == Entries.size() &&
10284 .
slice(Part * VL.size(),
10285 std::min<int>(VL.size(),
TE->Scalars.size())))) {
10291 return std::nullopt;
10294 bool IsIdentity = Entries.size() == 1;
10297 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
10298 unsigned Idx = Part * VL.size() + Pair.second;
10301 (ForOrder ? std::distance(
10302 Entries[Pair.first]->Scalars.begin(),
10303 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10304 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10305 IsIdentity &=
Mask[
Idx] == Pair.second;
10307 switch (Entries.size()) {
10309 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10313 if (EntryLanes.size() > 2 || VL.size() <= 2)
10321 std::fill(std::next(
Mask.begin(), Part * VL.size()),
10323 return std::nullopt;
10327BoUpSLP::isGatherShuffledEntry(
10331 assert(NumParts > 0 && NumParts < VL.
size() &&
10332 "Expected positive number of registers.");
10335 if (TE == VectorizableTree.front().get())
10338 assert(
TE->UserTreeIndices.size() == 1 &&
10339 "Expected only single user of the gather node.");
10341 "Number of scalars must be divisible by NumParts.");
10342 unsigned SliceSize = VL.
size() / NumParts;
10344 for (
unsigned Part = 0; Part < NumParts; ++Part) {
10347 std::optional<TTI::ShuffleKind> SubRes =
10348 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
10351 SubEntries.
clear();
10354 SubEntries.
front()->getVectorFactor() == VL.
size() &&
10355 (SubEntries.
front()->isSame(
TE->Scalars) ||
10356 SubEntries.
front()->isSame(VL))) {
10358 LocalSubEntries.
swap(SubEntries);
10361 std::iota(
Mask.begin(),
Mask.end(), 0);
10363 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
10364 if (isa<PoisonValue>(VL[
I]))
10366 Entries.emplace_back(1, LocalSubEntries.
front());
10372 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
10380 bool ForPoisonSrc)
const {
10382 Type *ScalarTy = VL[0]->getType();
10383 if (
StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
10384 ScalarTy =
SI->getValueOperand()->getType();
10386 bool DuplicateNonConst =
false;
10394 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
10401 for (
unsigned I = 0,
E = VL.
size();
I <
E; ++
I) {
10404 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V)) {
10412 EstimateInsertCost(
I, V);
10413 ShuffleMask[
I] =
I;
10417 DuplicateNonConst =
true;
10419 ShuffleMask[
I] = Res.first->second;
10425 if (DuplicateNonConst)
10427 VecTy, ShuffleMask);
10433void BoUpSLP::reorderInputsAccordingToOpcode(
10439 VLOperands Ops(VL, TLI, DL, SE, R);
10442 Left = Ops.getVL(0);
10443 Right = Ops.getVL(1);
10446Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *
E) {
10449 return *Res.second;
10453 auto *Front =
E->getMainOp();
10456 if (E->getOpcode() == Instruction::GetElementPtr &&
10457 !isa<GetElementPtrInst>(V))
10459 auto *I = cast<Instruction>(V);
10460 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
10461 isVectorLikeInstWithConstOps(I);
10464 auto FindLastInst = [&]() {
10466 for (
Value *V :
E->Scalars) {
10467 auto *
I = dyn_cast<Instruction>(V);
10470 if (LastInst->
getParent() ==
I->getParent()) {
10475 assert(((
E->getOpcode() == Instruction::GetElementPtr &&
10476 !isa<GetElementPtrInst>(
I)) ||
10479 "Expected vector-like or non-GEP in GEP node insts only.");
10487 auto *NodeB = DT->
getNode(
I->getParent());
10488 assert(NodeA &&
"Should only process reachable instructions");
10489 assert(NodeB &&
"Should only process reachable instructions");
10490 assert((NodeA == NodeB) ==
10491 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10492 "Different nodes should have different DFS numbers");
10493 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10500 auto FindFirstInst = [&]() {
10502 for (
Value *V :
E->Scalars) {
10503 auto *
I = dyn_cast<Instruction>(V);
10506 if (FirstInst->
getParent() ==
I->getParent()) {
10507 if (
I->comesBefore(FirstInst))
10511 assert(((
E->getOpcode() == Instruction::GetElementPtr &&
10512 !isa<GetElementPtrInst>(
I)) ||
10515 "Expected vector-like or non-GEP in GEP node insts only.");
10523 auto *NodeB = DT->
getNode(
I->getParent());
10524 assert(NodeA &&
"Should only process reachable instructions");
10525 assert(NodeB &&
"Should only process reachable instructions");
10526 assert((NodeA == NodeB) ==
10527 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10528 "Different nodes should have different DFS numbers");
10529 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
10538 (
E->State != TreeEntry::NeedToGather &&
10540 if ((
E->getOpcode() == Instruction::GetElementPtr &&
10543 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
10546 return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
10548 Res.second = FindLastInst();
10550 Res.second = FindFirstInst();
10551 return *Res.second;
10558 if (BlocksSchedules.count(BB)) {
10559 Value *
V =
E->isOneOf(
E->Scalars.back());
10562 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
10563 if (Bundle && Bundle->isPartOfBundle())
10564 for (; Bundle; Bundle = Bundle->NextInBundle)
10565 if (Bundle->OpValue == Bundle->Inst)
10566 Res.second = Bundle->Inst;
10588 Res.second = FindLastInst();
10589 assert(Res.second &&
"Failed to find last instruction in bundle");
10590 return *Res.second;
10593void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *
E) {
10594 auto *Front =
E->getMainOp();
10595 Instruction *LastInst = &getLastInstructionInBundle(
E);
10596 assert(LastInst &&
"Failed to find last instruction in bundle");
10599 bool IsPHI = isa<PHINode>(LastInst);
10602 if (IsPHI || (
E->State != TreeEntry::NeedToGather &&
10604 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
10608 Builder.SetInsertPoint(
10612 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
10622 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
10625 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
10626 InsertBB = InsertBB->getSinglePredecessor();
10627 return InsertBB && InsertBB == InstBB;
10629 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
10630 if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
10631 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
10632 getTreeEntry(Inst) ||
10633 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
10634 PostponedIndices.
insert(
I).second)
10638 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos) {
10639 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
10640 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
10643 GatherShuffleExtractSeq.
insert(InsElt);
10644 CSEBlocks.
insert(InsElt->getParent());
10646 if (isa<Instruction>(V)) {
10647 if (TreeEntry *Entry = getTreeEntry(V)) {
10649 unsigned FoundLane = Entry->findLaneForValue(V);
10650 ExternalUses.emplace_back(V, InsElt, FoundLane);
10656 isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
10661 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
10669 if (!isa<UndefValue>(VL[
I])) {
10673 if (isa<PoisonValue>(VL[
I]))
10675 if (
auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
10680 Vec = CreateInsertElement(Vec, VL[
I],
I);
10683 for (
int I : NonConsts)
10684 Vec = CreateInsertElement(Vec, VL[
I],
I);
10687 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
10688 Vec = CreateInsertElement(Vec, Pair.first, Pair.second);
10726 bool IsFinalized =
false;
10739 class ShuffleIRBuilder {
10750 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
10751 CSEBlocks(CSEBlocks) {}
10752 ~ShuffleIRBuilder() =
default;
10756 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
10757 GatherShuffleExtractSeq.
insert(
I);
10758 CSEBlocks.
insert(
I->getParent());
10767 unsigned VF = Mask.size();
10768 unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
10772 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
10773 GatherShuffleExtractSeq.
insert(
I);
10774 CSEBlocks.
insert(
I->getParent());
10778 Value *createIdentity(
Value *V) {
return V; }
10779 Value *createPoison(
Type *Ty,
unsigned VF) {
10784 void resizeToMatch(
Value *&V1,
Value *&V2) {
10785 if (V1->
getType() == V2->getType())
10787 int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
10788 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
10789 int VF = std::max(V1VF, V2VF);
10790 int MinVF = std::min(V1VF, V2VF);
10792 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
10794 Value *&
Op = MinVF == V1VF ? V1 : V2;
10796 if (
auto *
I = dyn_cast<Instruction>(
Op)) {
10797 GatherShuffleExtractSeq.
insert(
I);
10798 CSEBlocks.
insert(
I->getParent());
10811 assert(V1 &&
"Expected at least one vector value.");
10812 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
10814 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
10822 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10829 : Builder(Builder), R(R) {}
10833 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10834 unsigned NumParts,
bool &UseVecBaseAsInput) {
10835 UseVecBaseAsInput =
false;
10837 Value *VecBase =
nullptr;
10838 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
10842 auto *EI = cast<ExtractElementInst>(
E->Scalars[
I]);
10843 VecBase = EI->getVectorOperand();
10844 if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
10845 VecBase = TE->VectorizedValue;
10846 assert(VecBase &&
"Expected vectorized value.");
10847 UniqueBases.
insert(VecBase);
10850 if (!EI->hasOneUse() || (NumParts != 1 &&
count(
E->Scalars, EI) > 1) ||
10852 const TreeEntry *UTE = R.getTreeEntry(U);
10853 return !UTE || R.MultiNodeScalars.contains(U) ||
10854 count_if(R.VectorizableTree,
10855 [&](const std::unique_ptr<TreeEntry> &TE) {
10856 return any_of(TE->UserTreeIndices,
10857 [&](const EdgeInfo &Edge) {
10858 return Edge.UserTE == UTE;
10860 is_contained(TE->Scalars, EI);
10864 R.eraseInstruction(EI);
10866 if (NumParts == 1 || UniqueBases.
size() == 1)
10868 UseVecBaseAsInput =
true;
10878 Value *Vec =
nullptr;
10880 unsigned SliceSize =
E->Scalars.size() / NumParts;
10881 for (
unsigned Part = 0; Part < NumParts; ++Part) {
10885 constexpr int MaxBases = 2;
10893 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
10894 if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
10895 VecOp = TE->VectorizedValue;
10896 assert(VecOp &&
"Expected vectorized value.");
10898 cast<FixedVectorType>(VecOp->
getType())->getNumElements();
10900 assert((PrevSize ==
Size || PrevSize == 0) &&
10901 "Expected vectors of the same size.");
10904 Bases[SubMask[
I] <
Size ? 0 : 1] = VecOp;
10906 if (!Bases.front())
10909 if (Bases.back()) {
10910 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
10911 TransformToIdentity(SubMask);
10913 SubVec = Bases.front();
10920 Mask.slice(
P * SliceSize, SliceSize);
10925 "Expected first part or all previous parts masked.");
10926 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
10928 unsigned VF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
10930 unsigned SubVecVF =
10931 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
10932 VF = std::max(VF, SubVecVF);
10938 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
10939 Vec = createShuffle(Vec, SubVec, VecMask);
10940 TransformToIdentity(VecMask);
10948 std::optional<Value *>
10954 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
10956 return std::nullopt;
10960 E->getVectorFactor());
10968 add(E1.VectorizedValue, E2.VectorizedValue, Mask);
10973 add(E1.VectorizedValue, Mask);
10977 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
10978 if (InVectors.
empty()) {
10981 CommonMask.
assign(Mask.begin(), Mask.end());
10985 if (InVectors.
size() == 2) {
10986 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
10987 transformMaskAfterShuffle(CommonMask, CommonMask);
10988 }
else if (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
10990 Vec = createShuffle(Vec,
nullptr, CommonMask);
10991 transformMaskAfterShuffle(CommonMask, CommonMask);
10993 V1 = createShuffle(V1, V2, Mask);
10994 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10996 CommonMask[
Idx] =
Idx + Sz;
10997 InVectors.
front() = Vec;
10998 if (InVectors.
size() == 2)
10999 InVectors.
back() = V1;
11005 if (InVectors.
empty()) {
11006 if (!isa<FixedVectorType>(V1->
getType())) {
11007 V1 = createShuffle(V1,
nullptr, CommonMask);
11009 transformMaskAfterShuffle(CommonMask, Mask);
11012 CommonMask.
assign(Mask.begin(), Mask.end());
11015 const auto *It =
find(InVectors, V1);
11016 if (It == InVectors.
end()) {
11017 if (InVectors.
size() == 2 ||
11019 !isa<FixedVectorType>(V1->
getType())) {
11021 if (InVectors.
size() == 2) {
11022 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11023 transformMaskAfterShuffle(CommonMask, CommonMask);
11024 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11025 CommonMask.
size()) {
11026 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
11027 transformMaskAfterShuffle(CommonMask, CommonMask);
11029 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11032 V->getType() != V1->
getType()
11034 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
11035 ->getNumElements();
11036 if (V->getType() != V1->
getType())
11037 V1 = createShuffle(V1,
nullptr, Mask);
11038 InVectors.
front() = V;
11039 if (InVectors.
size() == 2)
11040 InVectors.
back() = V1;
11047 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11053 int VF = CommonMask.
size();
11054 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
11055 VF = FTy->getNumElements();
11056 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11058 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
11067 Value *Root =
nullptr) {
11068 return R.gather(VL, Root);
11077 IsFinalized =
true;
11080 if (InVectors.
size() == 2) {
11081 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11084 Vec = createShuffle(Vec,
nullptr, CommonMask);
11086 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11090 "Expected vector length for the final value before action.");
11091 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11094 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11095 Vec = createShuffle(Vec,
nullptr, ResizeMask);
11097 Action(Vec, CommonMask);
11098 InVectors.
front() = Vec;
11100 if (!ExtMask.
empty()) {
11101 if (CommonMask.
empty()) {
11105 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11108 NewMask[
I] = CommonMask[ExtMask[
I]];
11110 CommonMask.
swap(NewMask);
11113 if (CommonMask.
empty()) {
11114 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11115 return InVectors.
front();
11117 if (InVectors.
size() == 2)
11118 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11119 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
11124 "Shuffle construction must be finalized.");
11128Value *BoUpSLP::vectorizeOperand(TreeEntry *
E,
unsigned NodeIdx,
11129 bool PostponedPHIs) {
11131 const unsigned VF = VL.size();
11134 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11136 find_if(VL, [](
Value *V) {
return isa<GetElementPtrInst>(V); });
11137 if (It != VL.end())
11140 if (S.getOpcode()) {
11141 auto CheckSameVE = [&](
const TreeEntry *VE) {
11142 return VE->isSame(VL) &&
11143 (
any_of(VE->UserTreeIndices,
11144 [
E, NodeIdx](
const EdgeInfo &EI) {
11145 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11147 any_of(VectorizableTree,
11148 [
E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
11149 return TE->isOperandGatherNode({
E, NodeIdx}) &&
11150 VE->isSame(
TE->Scalars);
11153 TreeEntry *VE = getTreeEntry(S.OpValue);
11154 bool IsSameVE = VE && CheckSameVE(VE);
11156 auto It = MultiNodeScalars.
find(S.OpValue);
11157 if (It != MultiNodeScalars.
end()) {
11158 auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
11159 return TE != VE && CheckSameVE(TE);
11161 if (
I != It->getSecond().end()) {
11169 ShuffleInstructionBuilder ShuffleBuilder(Builder, *
this);
11170 ShuffleBuilder.add(V, Mask);
11171 return ShuffleBuilder.finalize(std::nullopt);
11174 if (VF != cast<FixedVectorType>(
V->getType())->getNumElements()) {
11175 if (!VE->ReuseShuffleIndices.empty()) {
11196 if (isa<PoisonValue>(V))
11198 Mask[
I] = VE->findLaneForValue(V);
11200 V = FinalShuffle(V, Mask);
11202 assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
11203 "Expected vectorization factor less "
11204 "than original vector size.");
11206 std::iota(UniformMask.begin(), UniformMask.end(), 0);
11207 V = FinalShuffle(V, UniformMask);
11213 if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
11214 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11215 }) == VE->UserTreeIndices.end()) {
11217 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
11218 return TE->State == TreeEntry::NeedToGather &&
11219 TE->UserTreeIndices.front().UserTE ==
E &&
11220 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11222 assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
11223 (*It)->VectorizedValue =
V;
11232 auto *
I =
find_if(VectorizableTree,
11233 [
E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
11234 return TE->isOperandGatherNode({
E, NodeIdx});
11236 assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
11237 assert(
I->get()->UserTreeIndices.size() == 1 &&
11238 "Expected only single user for the gather node.");
11239 assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
11243template <
typename BVTy,
typename ResTy,
typename...
Args>
11244ResTy BoUpSLP::processBuildVector(
const TreeEntry *
E, Args &...Params) {
11245 assert(
E->State == TreeEntry::NeedToGather &&
"Expected gather node.");
11246 unsigned VF =
E->getVectorFactor();
11248 bool NeedFreeze =
false;
11250 E->ReuseShuffleIndices.end());
11256 if (!ReorderMask.
empty())
11259 unsigned I,
unsigned SliceSize) {
11261 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11264 TreeEntry *UserTE =
E->UserTreeIndices.back().UserTE;
11265 unsigned EdgeIdx =
E->UserTreeIndices.back().EdgeIdx;
11266 if (UserTE->getNumOperands() != 2)
11269 find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
11270 return find_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
11271 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11272 }) !=
TE->UserTreeIndices.end();
11274 if (It == VectorizableTree.end())
11277 if ((
Mask.size() < InputVF &&
11280 (
Mask.size() == InputVF &&
11282 std::iota(std::next(
Mask.begin(),
I * SliceSize),
11283 std::next(
Mask.begin(), (
I + 1) * SliceSize), 0);
11287 std::fill(std::next(
Mask.begin(),
I * SliceSize),
11288 std::next(
Mask.begin(), (
I + 1) * SliceSize), IVal);
11292 BVTy ShuffleBuilder(Params...);
11293 ResTy Res = ResTy();
11297 Value *ExtractVecBase =
nullptr;
11298 bool UseVecBaseAsInput =
false;
11301 Type *ScalarTy = GatheredScalars.front()->getType();
11304 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11308 bool Resized =
false;
11310 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
11311 if (!ExtractShuffles.
empty()) {
11316 if (
const auto *TE = getTreeEntry(
11317 cast<ExtractElementInst>(
E->Scalars[
Idx])->getVectorOperand()))
11320 if (std::optional<ResTy> Delayed =
11321 ShuffleBuilder.needToDelay(
E, ExtractEntries)) {
11328 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
11329 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11330 ExtractVecBase = VecBase;
11331 if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
11332 if (VF == VecBaseTy->getNumElements() &&
11333 GatheredScalars.size() != VF) {
11335 GatheredScalars.append(VF - GatheredScalars.size(),
11341 if (!ExtractShuffles.
empty() ||
E->getOpcode() != Instruction::Load ||
11342 E->isAltShuffle() ||
11343 all_of(
E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
11345 (
E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11347 isGatherShuffledEntry(
E, GatheredScalars, Mask, Entries, NumParts);
11349 if (!GatherShuffles.
empty()) {
11350 if (std::optional<ResTy> Delayed =
11351 ShuffleBuilder.needToDelay(
E, Entries)) {
11358 if (GatherShuffles.
size() == 1 &&
11360 Entries.front().front()->isSame(
E->Scalars)) {
11365 <<
"SLP: perfect diamond match for gather bundle "
11368 Mask.resize(
E->Scalars.size());
11369 const TreeEntry *FrontTE = Entries.front().front();
11370 if (FrontTE->ReorderIndices.empty() &&
11371 ((FrontTE->ReuseShuffleIndices.empty() &&
11372 E->Scalars.size() == FrontTE->Scalars.size()) ||
11373 (
E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11374 std::iota(
Mask.begin(),
Mask.end(), 0);
11377 if (isa<PoisonValue>(V)) {
11381 Mask[
I] = FrontTE->findLaneForValue(V);
11384 ShuffleBuilder.add(*FrontTE, Mask);
11385 Res = ShuffleBuilder.finalize(
E->getCommonMask());
11389 if (GatheredScalars.size() != VF &&
11391 return any_of(TEs, [&](
const TreeEntry *TE) {
11392 return TE->getVectorFactor() == VF;
11395 GatheredScalars.append(VF - GatheredScalars.size(),
11399 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
11407 bool IsRootPoison) {
11410 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
11417 int NumNonConsts = 0;
11420 if (isa<UndefValue>(V)) {
11421 if (!isa<PoisonValue>(V)) {
11436 Scalars.
front() = OrigV;
11439 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
11440 Scalars[Res.first->second] = OrigV;
11441 ReuseMask[
I] = Res.first->second;
11444 if (NumNonConsts == 1) {
11449 if (!UndefPos.
empty() && UndefPos.
front() == 0)
11452 ReuseMask[SinglePos] = SinglePos;
11453 }
else if (!UndefPos.
empty() && IsSplat) {
11458 return !isa<UndefValue>(V) &&
11460 (
E->UserTreeIndices.size() == 1 &&
11464 return E->UserTreeIndices.front().EdgeIdx !=
11465 U.getOperandNo() &&
11467 E->UserTreeIndices.front().UserTE->Scalars,
11471 if (It != Scalars.
end()) {
11473 int Pos = std::distance(Scalars.
begin(), It);
11474 for (
int I : UndefPos) {
11476 ReuseMask[
I] = Pos;
11485 for (
int I : UndefPos) {
11487 if (isa<UndefValue>(Scalars[
I]))
11494 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
11495 bool IsNonPoisoned =
true;
11496 bool IsUsedInExpr =
true;
11497 Value *Vec1 =
nullptr;
11498 if (!ExtractShuffles.
empty()) {
11502 Value *Vec2 =
nullptr;
11503 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
11507 if (UseVecBaseAsInput) {
11508 Vec1 = ExtractVecBase;
11510 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
11513 if (isa<UndefValue>(
E->Scalars[
I]))
11515 auto *EI = cast<ExtractElementInst>(
E->Scalars[
I]);
11516 Value *VecOp = EI->getVectorOperand();
11517 if (
const auto *TE = getTreeEntry(VecOp))
11518 if (
TE->VectorizedValue)
11519 VecOp =
TE->VectorizedValue;
11522 }
else if (Vec1 != EI->getVectorOperand()) {
11523 assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
11524 "Expected only 1 or 2 vectors shuffle.");
11530 IsUsedInExpr =
false;
11533 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
11535 IsUsedInExpr &= FindReusedSplat(
11537 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
11538 ExtractMask.size());
11539 ShuffleBuilder.add(Vec1, ExtractMask,
true);
11542 IsUsedInExpr =
false;
11544 ScalarTy, GatheredScalars.size())),
11545 ExtractMask,
true);
11548 if (!GatherShuffles.
empty()) {
11549 unsigned SliceSize =
E->Scalars.size() / NumParts;
11551 for (
const auto [
I, TEs] :
enumerate(Entries)) {
11554 "No shuffles with empty entries list expected.");
11558 "Expected shuffle of 1 or 2 entries.");
11561 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
11562 if (TEs.
size() == 1) {
11564 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I, SliceSize);
11565 ShuffleBuilder.add(*TEs.
front(), VecMask);
11566 if (TEs.
front()->VectorizedValue)
11570 IsUsedInExpr =
false;
11571 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
11572 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
11583 int EMSz = ExtractMask.size();
11584 int MSz =
Mask.size();
11587 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
11588 bool IsIdentityShuffle =
11589 ((UseVecBaseAsInput ||
11591 [](
const std::optional<TTI::ShuffleKind> &SK) {
11595 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
11597 (!GatherShuffles.
empty() &&
11599 [](
const std::optional<TTI::ShuffleKind> &SK) {
11603 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
11605 bool EnoughConstsForShuffle =
11609 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11613 return isa<Constant>(V) && !isa<UndefValue>(V);
11615 (!IsIdentityShuffle ||
11616 (GatheredScalars.size() == 2 &&
11618 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
11620 return isa<Constant>(V) && !isa<PoisonValue>(V);
11624 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
11625 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
11633 TryPackScalars(GatheredScalars, BVMask,
true);
11634 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
11635 ShuffleBuilder.add(BV, BVMask);
11638 return isa<PoisonValue>(V) ||
11639 (IsSingleShuffle && ((IsIdentityShuffle &&
11640 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
11642 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices);
11644 Res = ShuffleBuilder.finalize(
11645 E->ReuseShuffleIndices,
E->Scalars.size(),
11647 TryPackScalars(NonConstants, Mask,
false);
11648 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
11653 TryPackScalars(GatheredScalars, ReuseMask,
true);
11654 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
11655 ShuffleBuilder.add(BV, ReuseMask);
11656 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices);
11661 if (!isa<PoisonValue>(V))
11664 Value *BV = ShuffleBuilder.gather(
E->Scalars);
11665 ShuffleBuilder.add(BV, Mask);
11666 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices);
11670 Res = ShuffleBuilder.createFreeze(Res);
11674Value *BoUpSLP::createBuildVector(
const TreeEntry *
E) {
11675 return processBuildVector<ShuffleInstructionBuilder, Value *>(
E, Builder,
11682 if (
E->VectorizedValue &&
11683 (
E->State != TreeEntry::Vectorize ||
E->getOpcode() != Instruction::PHI ||
11684 E->isAltShuffle())) {
11685 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *
E->Scalars[0] <<
".\n");
11686 return E->VectorizedValue;
11689 if (
E->State == TreeEntry::NeedToGather) {
11691 if (
E->getMainOp() &&
E->Idx == 0 && !UserIgnoreList)
11692 setInsertPointAfterBundle(
E);
11693 Value *Vec = createBuildVector(
E);
11694 E->VectorizedValue = Vec;
11701 if (
V->getType() != VecTy)
11703 ShuffleInstructionBuilder ShuffleBuilder(Builder, *
this);
11704 if (
E->getOpcode() == Instruction::Store) {
11706 ArrayRef(
reinterpret_cast<const int *
>(
E->ReorderIndices.begin()),
11707 E->ReorderIndices.size());
11708 ShuffleBuilder.add(V, Mask);
11709 }
else if (
E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
11710 ShuffleBuilder.addOrdered(V, std::nullopt);
11712 ShuffleBuilder.addOrdered(V,
E->ReorderIndices);
11714 return ShuffleBuilder.finalize(
E->ReuseShuffleIndices);
11717 assert((
E->State == TreeEntry::Vectorize ||
11718 E->State == TreeEntry::ScatterVectorize ||
11719 E->State == TreeEntry::StridedVectorize) &&
11720 "Unhandled state");
11721 unsigned ShuffleOrOp =
11722 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector :
E->getOpcode();
11725 if (
auto *Store = dyn_cast<StoreInst>(VL0))
11726 ScalarTy =
Store->getValueOperand()->getType();
11727 else if (
auto *IE = dyn_cast<InsertElementInst>(VL0))
11728 ScalarTy =
IE->getOperand(1)->getType();
11729 bool IsSigned =
false;
11730 auto It = MinBWs.
find(
E);
11731 if (It != MinBWs.
end()) {
11733 IsSigned = It->second.second;
11736 switch (ShuffleOrOp) {
11737 case Instruction::PHI: {
11738 assert((
E->ReorderIndices.empty() ||
11739 E != VectorizableTree.front().get() ||
11740 !
E->UserTreeIndices.empty()) &&
11741 "PHI reordering is free.");
11742 if (PostponedPHIs &&
E->VectorizedValue)
11743 return E->VectorizedValue;
11744 auto *PH = cast<PHINode>(VL0);
11746 PH->getParent()->getFirstNonPHIIt());
11748 if (PostponedPHIs || !
E->VectorizedValue) {
11755 PH->getParent()->getFirstInsertionPt());
11758 V = FinalShuffle(V,
E, VecTy, IsSigned);
11760 E->VectorizedValue =
V;
11764 PHINode *NewPhi = cast<PHINode>(
E->PHI);
11773 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
11779 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
11783 if (!VisitedBBs.
insert(IBB).second) {
11790 Value *Vec = vectorizeOperand(
E,
I,
true);
11791 if (VecTy != Vec->
getType()) {
11793 "Expected item in MinBWs.");
11794 Vec = Builder.
CreateIntCast(Vec, VecTy, It->second.second);
11800 "Invalid number of incoming values");
11804 case Instruction::ExtractElement: {
11805 Value *
V =
E->getSingleOperand(0);
11806 if (
const TreeEntry *TE = getTreeEntry(V))
11807 V =
TE->VectorizedValue;
11808 setInsertPointAfterBundle(
E);
11809 V = FinalShuffle(V,
E, VecTy, IsSigned);
11810 E->VectorizedValue =
V;
11813 case Instruction::ExtractValue: {
11814 auto *LI = cast<LoadInst>(
E->getSingleOperand(0));
11819 NewV = FinalShuffle(NewV,
E, VecTy, IsSigned);
11820 E->VectorizedValue = NewV;
11823 case Instruction::InsertElement: {
11824 assert(
E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
11826 Value *
V = vectorizeOperand(
E, 1, PostponedPHIs);
11828 Type *ScalarTy =
Op.front()->getType();
11829 if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
11831 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(
E, 1));
11832 assert(Res.first > 0 &&
"Expected item in MinBWs.");
11837 cast<FixedVectorType>(
V->getType())->getNumElements()),
11842 auto *FirstInsert = cast<Instruction>(*
find_if(
E->Scalars, [
E](
Value *V) {
11843 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11845 const unsigned NumElts =
11846 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
11847 const unsigned NumScalars =
E->Scalars.size();
11850 assert(
Offset < NumElts &&
"Failed to find vector index offset");
11854 if (!
E->ReorderIndices.empty()) {
11859 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
11862 bool IsIdentity =
true;
11864 Mask.swap(PrevMask);
11865 for (
unsigned I = 0;
I < NumScalars; ++
I) {
11868 IsIdentity &= InsertIdx -
Offset ==
I;
11871 if (!IsIdentity || NumElts != NumScalars) {
11875 if (NumElts != NumScalars &&
Offset == 0) {
11884 InsertMask[*InsertIdx] = *InsertIdx;
11885 if (!
Ins->hasOneUse())
11887 Ins = dyn_cast_or_null<InsertElementInst>(
11888 Ins->getUniqueUndroppableUser());
11891 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
11893 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11896 if (!IsFirstPoison.
all()) {
11898 for (
unsigned I = 0;
I < NumElts;
I++) {
11900 IsFirstUndef.
test(
I)) {
11901 if (IsVNonPoisonous) {
11902 InsertMask[
I] =
I < NumScalars ?
I : 0;
11907 if (
Idx >= NumScalars)
11908 Idx = NumScalars - 1;
11909 InsertMask[
I] = NumScalars +
Idx;
11923 if (
auto *
I = dyn_cast<Instruction>(V)) {
11924 GatherShuffleExtractSeq.
insert(
I);
11925 CSEBlocks.
insert(
I->getParent());
11930 for (
unsigned I = 0;
I < NumElts;
I++) {
11935 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
11938 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
11939 NumElts != NumScalars) {
11940 if (IsFirstUndef.
all()) {
11943 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11944 if (!IsFirstPoison.
all()) {
11945 for (
unsigned I = 0;
I < NumElts;
I++) {
11947 InsertMask[
I] =
I + NumElts;
11954 InsertMask, cast<Instruction>(
E->Scalars.back())->
getName());
11955 if (
auto *
I = dyn_cast<Instruction>(V)) {
11956 GatherShuffleExtractSeq.
insert(
I);
11957 CSEBlocks.
insert(
I->getParent());
11962 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11963 for (
unsigned I = 0;
I < NumElts;
I++) {
11967 InsertMask[
I] += NumElts;
11970 FirstInsert->getOperand(0), V, InsertMask,
11971 cast<Instruction>(
E->Scalars.back())->getName());
11972 if (
auto *
I = dyn_cast<Instruction>(V)) {
11973 GatherShuffleExtractSeq.
insert(
I);
11974 CSEBlocks.
insert(
I->getParent());
11979 ++NumVectorInstructions;
11980 E->VectorizedValue =
V;
11983 case Instruction::ZExt:
11984 case Instruction::SExt:
11985 case Instruction::FPToUI:
11986 case Instruction::FPToSI:
11987 case Instruction::FPExt:
11988 case Instruction::PtrToInt:
11989 case Instruction::IntToPtr:
11990 case Instruction::SIToFP:
11991 case Instruction::UIToFP:
11992 case Instruction::Trunc:
11993 case Instruction::FPTrunc:
11994 case Instruction::BitCast: {
11995 setInsertPointAfterBundle(
E);
11997 Value *InVec = vectorizeOperand(
E, 0, PostponedPHIs);
11998 if (
E->VectorizedValue) {
11999 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12000 return E->VectorizedValue;
12003 auto *CI = cast<CastInst>(VL0);
12006 auto SrcIt = MinBWs.
find(getOperandEntry(
E, 0));
12008 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
12010 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
12011 if (SrcIt != MinBWs.
end())
12012 SrcBWSz = SrcIt->second.first;
12013 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
12014 if (BWSz == SrcBWSz) {
12015 VecOpcode = Instruction::BitCast;
12016 }
else if (BWSz < SrcBWSz) {
12017 VecOpcode = Instruction::Trunc;
12018 }
else if (SrcIt != MinBWs.
end()) {
12019 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12021 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12024 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12026 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
12027 V = FinalShuffle(V,
E, VecTy, IsSigned);
12029 E->VectorizedValue =
V;
12030 ++NumVectorInstructions;
12033 case Instruction::FCmp:
12034 case Instruction::ICmp: {
12035 setInsertPointAfterBundle(
E);
12037 Value *
L = vectorizeOperand(
E, 0, PostponedPHIs);
12038 if (
E->VectorizedValue) {
12039 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12040 return E->VectorizedValue;
12042 Value *
R = vectorizeOperand(
E, 1, PostponedPHIs);
12043 if (
E->VectorizedValue) {
12044 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12045 return E->VectorizedValue;
12047 if (
L->getType() !=
R->getType()) {
12049 MinBWs.
contains(getOperandEntry(
E, 1))) &&
12050 "Expected item in MinBWs.");
12059 VecTy = cast<FixedVectorType>(
V->getType());
12060 V = FinalShuffle(V,
E, VecTy, IsSigned);
12062 E->VectorizedValue =
V;
12063 ++NumVectorInstructions;
12066 case Instruction::Select: {
12067 setInsertPointAfterBundle(
E);
12069 Value *
Cond = vectorizeOperand(
E, 0, PostponedPHIs);
12070 if (
E->VectorizedValue) {
12071 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12072 return E->VectorizedValue;
12074 Value *True = vectorizeOperand(
E, 1, PostponedPHIs);
12075 if (
E->VectorizedValue) {
12076 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12077 return E->VectorizedValue;
12079 Value *False = vectorizeOperand(
E, 2, PostponedPHIs);
12080 if (
E->VectorizedValue) {
12081 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12082 return E->VectorizedValue;
12086 MinBWs.
contains(getOperandEntry(
E, 2))) &&
12087 "Expected item in MinBWs.");
12093 V = FinalShuffle(V,
E, VecTy, IsSigned);
12095 E->VectorizedValue =
V;
12096 ++NumVectorInstructions;
12099 case Instruction::FNeg: {
12100 setInsertPointAfterBundle(
E);
12102 Value *
Op = vectorizeOperand(
E, 0, PostponedPHIs);
12104 if (
E->VectorizedValue) {
12105 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12106 return E->VectorizedValue;
12112 if (
auto *
I = dyn_cast<Instruction>(V))
12115 V = FinalShuffle(V,
E, VecTy, IsSigned);
12117 E->VectorizedValue =
V;
12118 ++NumVectorInstructions;
12122 case Instruction::Add:
12123 case Instruction::FAdd:
12124 case Instruction::Sub:
12125 case Instruction::FSub:
12126 case Instruction::Mul:
12127 case Instruction::FMul:
12128 case Instruction::UDiv:
12129 case Instruction::SDiv:
12130 case Instruction::FDiv:
12131 case Instruction::URem:
12132 case Instruction::SRem:
12133 case Instruction::FRem:
12134 case Instruction::Shl:
12135 case Instruction::LShr:
12136 case Instruction::AShr:
12137 case Instruction::And:
12138 case Instruction::Or:
12139 case Instruction::Xor: {
12140 setInsertPointAfterBundle(
E);
12142 Value *
LHS = vectorizeOperand(
E, 0, PostponedPHIs);
12143 if (
E->VectorizedValue) {
12144 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12145 return E->VectorizedValue;
12147 Value *
RHS = vectorizeOperand(
E, 1, PostponedPHIs);
12148 if (
E->VectorizedValue) {
12149 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12150 return E->VectorizedValue;
12154 MinBWs.
contains(getOperandEntry(
E, 1))) &&
12155 "Expected item in MinBWs.");
12164 if (
auto *
I = dyn_cast<Instruction>(V))
12167 V = FinalShuffle(V,
E, VecTy, IsSigned);
12169 E->VectorizedValue =
V;
12170 ++NumVectorInstructions;
12174 case Instruction::Load: {
12177 setInsertPointAfterBundle(
E);
12179 LoadInst *LI = cast<LoadInst>(VL0);
12182 if (
E->State == TreeEntry::Vectorize) {
12184 }
else if (
E->State == TreeEntry::StridedVectorize) {
12185 Value *Ptr0 = cast<LoadInst>(
E->Scalars.front())->getPointerOperand();
12186 Value *PtrN = cast<LoadInst>(
E->Scalars.back())->getPointerOperand();
12187 PO = IsReverseOrder ? PtrN : Ptr0;
12193 int Stride = *Diff / (
static_cast<int>(
E->Scalars.size()) - 1);
12195 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
12196 DL->getTypeAllocSize(ScalarTy));
12200 return cast<LoadInst>(V)->getPointerOperand();
12203 std::optional<Value *> Stride =
12212 (IsReverseOrder ? -1 : 1) *
12213 static_cast<int>(
DL->getTypeAllocSize(ScalarTy))));
12215 Align CommonAlignment = computeCommonAlignment<LoadInst>(
E->Scalars);
12217 Intrinsic::experimental_vp_strided_load,
12218 {VecTy, PO->
getType(), StrideTy},
12219 {PO, StrideVal, Builder.
getAllOnesMask(VecTy->getElementCount()),
12226 assert(
E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
12227 Value *VecPtr = vectorizeOperand(
E, 0, PostponedPHIs);
12228 if (
E->VectorizedValue) {
12229 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12230 return E->VectorizedValue;
12233 Align CommonAlignment = computeCommonAlignment<LoadInst>(
E->Scalars);
12238 V = FinalShuffle(V,
E, VecTy, IsSigned);
12239 E->VectorizedValue =
V;
12240 ++NumVectorInstructions;
12243 case Instruction::Store: {
12244 auto *
SI = cast<StoreInst>(VL0);
12246 setInsertPointAfterBundle(
E);
12248 Value *VecValue = vectorizeOperand(
E, 0, PostponedPHIs);
12249 VecValue = FinalShuffle(VecValue,
E, VecTy, IsSigned);
12257 E->VectorizedValue =
V;
12258 ++NumVectorInstructions;
12261 case Instruction::GetElementPtr: {
12262 auto *GEP0 = cast<GetElementPtrInst>(VL0);
12263 setInsertPointAfterBundle(
E);
12265 Value *Op0 = vectorizeOperand(
E, 0, PostponedPHIs);
12266 if (
E->VectorizedValue) {
12267 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12268 return E->VectorizedValue;
12272 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
12273 Value *OpVec = vectorizeOperand(
E, J, PostponedPHIs);
12274 if (
E->VectorizedValue) {
12275 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12276 return E->VectorizedValue;
12281 Value *
V = Builder.
CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
12282 if (
Instruction *
I = dyn_cast<GetElementPtrInst>(V)) {
12284 for (
Value *V :
E->Scalars) {
12285 if (isa<GetElementPtrInst>(V))
12291 V = FinalShuffle(V,
E, VecTy, IsSigned);
12293 E->VectorizedValue =
V;
12294 ++NumVectorInstructions;
12298 case Instruction::Call: {
12299 CallInst *CI = cast<CallInst>(VL0);
12300 setInsertPointAfterBundle(
E);
12306 VecCallCosts.first <= VecCallCosts.second;
12308 Value *ScalarArg =
nullptr;
12315 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
12320 CallInst *CEI = cast<CallInst>(VL0);
12328 Value *OpVec = vectorizeOperand(
E,
I, PostponedPHIs);
12329 if (
E->VectorizedValue) {
12330 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12331 return E->VectorizedValue;
12340 if (!UseIntrinsic) {
12344 static_cast<unsigned>(VecTy->getNumElements())),
12356 V = FinalShuffle(V,
E, VecTy, IsSigned);
12358 E->VectorizedValue =
V;
12359 ++NumVectorInstructions;
12362 case Instruction::ShuffleVector: {
12368 (isa<CmpInst>(VL0) && isa<CmpInst>(
E->getAltOp()))) &&
12369 "Invalid Shuffle Vector Operand");
12373 setInsertPointAfterBundle(
E);
12374 LHS = vectorizeOperand(
E, 0, PostponedPHIs);
12375 if (
E->VectorizedValue) {
12376 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12377 return E->VectorizedValue;
12379 RHS = vectorizeOperand(
E, 1, PostponedPHIs);
12381 setInsertPointAfterBundle(
E);
12382 LHS = vectorizeOperand(
E, 0, PostponedPHIs);
12384 if (
E->VectorizedValue) {
12385 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12386 return E->VectorizedValue;
12390 MinBWs.
contains(getOperandEntry(
E, 1))) &&
12391 "Expected item in MinBWs.");
12402 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
12403 V0 = Builder.
CreateCmp(CI0->getPredicate(), LHS, RHS);
12404 auto *AltCI = cast<CmpInst>(
E->getAltOp());
12406 V1 = Builder.
CreateCmp(AltPred, LHS, RHS);
12415 for (
Value *V : {V0, V1}) {
12416 if (
auto *
I = dyn_cast<Instruction>(V)) {
12417 GatherShuffleExtractSeq.
insert(
I);
12418 CSEBlocks.
insert(
I->getParent());
12427 E->buildAltOpShuffleMask(
12429 assert(
E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
12433 Mask, &OpScalars, &AltScalars);
12439 if (
auto *
I = dyn_cast<Instruction>(V)) {
12441 GatherShuffleExtractSeq.
insert(
I);
12442 CSEBlocks.
insert(
I->getParent());
12445 if (
V->getType() != VecTy && !isa<CmpInst>(VL0))
12448 E->VectorizedValue =
V;
12449 ++NumVectorInstructions;
12462 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
12468struct ShuffledInsertData {
12481 for (
auto &BSIter : BlocksSchedules) {
12482 scheduleBlock(BSIter.second.get());
12486 EntryToLastInstruction.
clear();
12496 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
12497 if (TE->State == TreeEntry::Vectorize &&
12498 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
12499 TE->VectorizedValue)
12505 for (
const TreeEntry *
E : PostponedNodes) {
12506 auto *TE =
const_cast<TreeEntry *
>(
E);
12507 if (
auto *VecTE = getTreeEntry(TE->Scalars.front()))
12508 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
12509 TE->UserTreeIndices.front().EdgeIdx)))
12513 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
12514 TE->VectorizedValue =
nullptr;
12516 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
12525 if (isa<PHINode>(UserI)) {
12528 for (
User *U : PrevVec->users()) {
12531 auto *UI = dyn_cast<Instruction>(U);
12532 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->
getParent())
12534 if (UI->comesBefore(InsertPt))
12543 PrevVec->replaceAllUsesWith(Vec);
12544 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
12547 auto It = PostponedValues.
find(PrevVec);
12548 if (It != PostponedValues.
end()) {
12549 for (TreeEntry *VTE : It->getSecond())
12550 VTE->VectorizedValue = Vec;
12570 for (
const auto &ExternalUse : ExternalUses) {
12571 Value *Scalar = ExternalUse.Scalar;
12578 TreeEntry *
E = getTreeEntry(Scalar);
12579 assert(
E &&
"Invalid scalar");
12580 assert(
E->State != TreeEntry::NeedToGather &&
12581 "Extracting from a gather list");
12583 if (
E->getOpcode() == Instruction::GetElementPtr &&
12584 !isa<GetElementPtrInst>(Scalar))
12587 Value *Vec =
E->VectorizedValue;
12588 assert(Vec &&
"Can't find vectorizable value");
12591 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
12592 if (Scalar->getType() != Vec->
getType()) {
12593 Value *Ex =
nullptr;
12594 Value *ExV =
nullptr;
12595 auto It = ScalarToEEs.find(Scalar);
12596 if (It != ScalarToEEs.end()) {
12600 if (EEIt != It->second.end()) {
12606 if (
auto *CI = EEIt->second.second)
12610 ExV = EEIt->second.second ? EEIt->second.second : Ex;
12615 if (
auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
12616 Value *V = ES->getVectorOperand();
12617 if (
const TreeEntry *ETE = getTreeEntry(V))
12618 V = ETE->VectorizedValue;
12626 if (Scalar->getType() != Ex->
getType())
12628 MinBWs.
find(
E)->second.second);
12629 if (
auto *
I = dyn_cast<Instruction>(Ex))
12630 ScalarToEEs[Scalar].try_emplace(
12632 std::make_pair(
I, cast<Instruction>(ExV)));
12636 if (
auto *ExI = dyn_cast<Instruction>(Ex)) {
12637 GatherShuffleExtractSeq.
insert(ExI);
12638 CSEBlocks.
insert(ExI->getParent());
12642 assert(isa<FixedVectorType>(Scalar->getType()) &&
12643 isa<InsertElementInst>(Scalar) &&
12644 "In-tree scalar of vector type is not insertelement?");
12645 auto *IE = cast<InsertElementInst>(Scalar);
12653 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
12658 TreeEntry *UseEntry = getTreeEntry(U);
12660 (UseEntry->State == TreeEntry::Vectorize ||
12662 TreeEntry::StridedVectorize) &&
12663 (E->State == TreeEntry::Vectorize ||
12664 E->State == TreeEntry::StridedVectorize) &&
12665 doesInTreeUserNeedToExtract(
12667 cast<Instruction>(UseEntry->Scalars.front()),
12670 "Scalar with nullptr User must be registered in "
12671 "ExternallyUsedValues map or remain as scalar in vectorized "
12673 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
12674 if (
auto *
PHI = dyn_cast<PHINode>(VecI))
12676 PHI->getParent()->getFirstNonPHIIt());
12679 std::next(VecI->getIterator()));
12683 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12685 Scalar->replaceAllUsesWith(NewInst);
12686 ReplacedExternals.emplace_back(Scalar, NewInst);
12690 if (
auto *VU = dyn_cast<InsertElementInst>(
User)) {
12692 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
12693 if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
12694 if (!UsedInserts.
insert(VU).second)
12697 auto BWIt = MinBWs.
find(
E);
12699 auto *ScalarTy = FTy->getElementType();
12700 auto Key = std::make_pair(Vec, ScalarTy);
12701 auto VecIt = VectorCasts.
find(Key);
12702 if (VecIt == VectorCasts.
end()) {
12704 if (
auto *IVec = dyn_cast<Instruction>(Vec))
12710 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
12711 BWIt->second.second);
12714 Vec = VecIt->second;
12721 find_if(ShuffledInserts, [VU](
const ShuffledInsertData &
Data) {
12728 unsigned Idx = *InsertIdx;
12729 if (It == ShuffledInserts.
end()) {
12731 It = std::next(ShuffledInserts.
begin(),
12732 ShuffledInserts.
size() - 1);
12738 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
12739 if (IEBase !=
User &&
12740 (!IEBase->hasOneUse() ||
12744 if (
const TreeEntry *
E = getTreeEntry(IEBase)) {
12746 IEBase = cast<InsertElementInst>(
Base);
12749 "InsertElementInstruction used already.");
12750 Mask[IEIdx] = IEIdx;
12751 Base = IEBase->getOperand(0);
12752 }
while (
E == getTreeEntry(
Base));
12755 Base = cast<InsertElementInst>(
Base)->getOperand(0);
12759 auto It = VectorToInsertElement.
find(
Base);
12760 if (It != VectorToInsertElement.
end())
12767 Mask[
Idx] = ExternalUse.Lane;
12768 It->InsertElements.push_back(cast<InsertElementInst>(
User));
12777 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
12779 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12780 if (PH->getIncomingValue(
I) == Scalar) {
12782 PH->getIncomingBlock(
I)->getTerminator();
12783 if (isa<CatchSwitchInst>(IncomingTerminator)) {
12785 std::next(VecI->getIterator()));
12789 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12790 PH->setOperand(
I, NewInst);
12795 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12800 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12810 int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
12811 for (
int I = 0,
E = Mask.size();
I <
E; ++
I) {
12813 CombinedMask1[
I] = Mask[
I];
12815 CombinedMask2[
I] = Mask[
I] - VF;
12818 ShuffleBuilder.
add(V1, CombinedMask1);
12820 ShuffleBuilder.
add(V2, CombinedMask2);
12821 return ShuffleBuilder.
finalize(std::nullopt);
12825 bool ForSingleMask) {
12826 unsigned VF = Mask.size();
12827 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
12829 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
12830 Vec = CreateShuffle(Vec,
nullptr, Mask);
12831 return std::make_pair(Vec,
true);
12833 if (!ForSingleMask) {
12835 for (
unsigned I = 0;
I < VF; ++
I) {
12837 ResizeMask[Mask[
I]] = Mask[
I];
12839 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
12843 return std::make_pair(Vec,
false);
12847 for (
int I = 0,
E = ShuffledInserts.
size();
I <
E; ++
I) {
12853 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
12854 Value *NewInst = performExtractsShuffleAction<Value>(
12858 return cast<VectorType>(Vec->getType())
12859 ->getElementCount()
12860 .getKnownMinValue();
12865 assert((Vals.size() == 1 || Vals.size() == 2) &&
12866 "Expected exactly 1 or 2 input values.");
12867 if (Vals.size() == 1) {
12870 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
12871 ->getNumElements() ||
12872 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
12873 return CreateShuffle(Vals.front(), nullptr, Mask);
12874 return Vals.front();
12876 return CreateShuffle(Vals.
front() ? Vals.
front()
12878 Vals.
back(), Mask);
12880 auto It = ShuffledInserts[
I].InsertElements.
rbegin();
12883 if (It != ShuffledInserts[
I].InsertElements.
rend())
12886 while (It != ShuffledInserts[
I].InsertElements.
rend()) {
12887 assert(II &&
"Must be an insertelement instruction.");
12891 Inserts.
push_back(cast<Instruction>(II));
12892 II = dyn_cast<InsertElementInst>(II->
getOperand(0));
12896 if (
auto *NewI = dyn_cast<Instruction>(NewInst))
12903 IE->replaceUsesOfWith(IE->getOperand(0),
12905 IE->replaceUsesOfWith(IE->getOperand(1),
12909 CSEBlocks.
insert(LastInsert->getParent());
12914 for (
auto &TEPtr : VectorizableTree) {
12915 TreeEntry *Entry = TEPtr.get();
12918 if (Entry->State == TreeEntry::NeedToGather)
12921 assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
12924 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
12925 Value *Scalar = Entry->Scalars[Lane];
12927 if (Entry->getOpcode() == Instruction::GetElementPtr &&
12928 !isa<GetElementPtrInst>(Scalar))
12931 Type *Ty = Scalar->getType();
12933 for (
User *U : Scalar->users()) {
12937 assert((getTreeEntry(U) ||
12938 (UserIgnoreList && UserIgnoreList->contains(U)) ||
12939 (isa_and_nonnull<Instruction>(U) &&
12940 isDeleted(cast<Instruction>(U)))) &&
12941 "Deleting out-of-tree value");
12945 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
12950 RemovedInsts.
push_back(cast<Instruction>(Scalar));
12956 if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
12957 V->mergeDIAssignID(RemovedInsts);
12960 InstrElementSize.
clear();
12962 return VectorizableTree[0]->VectorizedValue;
12967 <<
" gather sequences instructions.\n");
12974 Loop *L = LI->getLoopFor(
I->getParent());
12979 BasicBlock *PreHeader = L->getLoopPreheader();
12987 auto *OpI = dyn_cast<Instruction>(V);
12988 return OpI && L->contains(OpI);
12994 CSEBlocks.
insert(PreHeader);
13009 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
13010 "Different nodes should have different DFS numbers");
13011 return A->getDFSNumIn() <
B->getDFSNumIn();
13021 if (I1->getType() != I2->getType())
13023 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
13024 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
13026 return I1->isIdenticalTo(I2);
13027 if (SI1->isIdenticalTo(SI2))
13029 for (
int I = 0,
E = SI1->getNumOperands();
I <
E; ++
I)
13030 if (SI1->getOperand(
I) != SI2->getOperand(
I))
13033 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
13037 unsigned LastUndefsCnt = 0;
13038 for (
int I = 0,
E = NewMask.
size();
I <
E; ++
I) {
13044 NewMask[
I] != SM1[
I])
13047 NewMask[
I] = SM1[
I];
13051 return SM1.
size() - LastUndefsCnt > 1 &&
13055 SM1.
size() - LastUndefsCnt));
13061 for (
auto I = CSEWorkList.
begin(),
E = CSEWorkList.
end();
I !=
E; ++
I) {
13064 "Worklist not sorted properly!");
13070 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
13071 !GatherShuffleExtractSeq.contains(&In))
13076 bool Replaced =
false;
13079 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13080 DT->
dominates(V->getParent(), In.getParent())) {
13081 In.replaceAllUsesWith(V);
13083 if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
13084 if (!NewMask.
empty())
13085 SI->setShuffleMask(NewMask);
13089 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
13090 GatherShuffleExtractSeq.contains(V) &&
13091 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13092 DT->
dominates(In.getParent(), V->getParent())) {
13094 V->replaceAllUsesWith(&In);
13096 if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
13097 if (!NewMask.
empty())
13098 SI->setShuffleMask(NewMask);
13106 Visited.push_back(&In);
13111 GatherShuffleExtractSeq.clear();
13114BoUpSLP::ScheduleData *
13116 ScheduleData *Bundle =
nullptr;
13117 ScheduleData *PrevInBundle =
nullptr;
13118 for (
Value *V : VL) {
13121 ScheduleData *BundleMember = getScheduleData(V);
13123 "no ScheduleData for bundle member "
13124 "(maybe not in same basic block)");
13125 assert(BundleMember->isSchedulingEntity() &&
13126 "bundle member already part of other bundle");
13127 if (PrevInBundle) {
13128 PrevInBundle->NextInBundle = BundleMember;
13130 Bundle = BundleMember;
13134 BundleMember->FirstInBundle = Bundle;
13135 PrevInBundle = BundleMember;
13137 assert(Bundle &&
"Failed to find schedule bundle");
13143std::optional<BoUpSLP::ScheduleData *>
13145 const InstructionsState &S) {
13156 auto TryScheduleBundleImpl = [
this, OldScheduleEnd, SLP](
bool ReSchedule,
13157 ScheduleData *Bundle) {
13163 if (ScheduleEnd != OldScheduleEnd) {
13164 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode())
13165 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->clearDependencies(); });
13170 <<
" in block " << BB->
getName() <<
"\n");
13171 calculateDependencies(Bundle,
true, SLP);
13176 initialFillReadyList(ReadyInsts);
13183 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
13184 !ReadyInsts.empty()) {
13185 ScheduleData *Picked = ReadyInsts.pop_back_val();
13186 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
13187 "must be ready to schedule");
13188 schedule(Picked, ReadyInsts);
13194 for (
Value *V : VL) {
13197 if (!extendSchedulingRegion(V, S)) {
13204 TryScheduleBundleImpl(
false,
nullptr);
13205 return std::nullopt;
13209 bool ReSchedule =
false;
13210 for (
Value *V : VL) {
13213 ScheduleData *BundleMember = getScheduleData(V);
13215 "no ScheduleData for bundle member (maybe not in same basic block)");
13219 ReadyInsts.remove(BundleMember);
13221 if (!BundleMember->IsScheduled)
13226 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
13227 <<
" was already scheduled\n");
13231 auto *Bundle = buildBundle(VL);
13232 TryScheduleBundleImpl(ReSchedule, Bundle);
13233 if (!Bundle->isReady()) {
13234 cancelScheduling(VL, S.OpValue);
13235 return std::nullopt;
13248 ScheduleData *Bundle = getScheduleData(OpValue);
13249 LLVM_DEBUG(
dbgs() <<
"SLP: cancel scheduling of " << *Bundle <<
"\n");
13250 assert(!Bundle->IsScheduled &&
13251 "Can't cancel bundle which is already scheduled");
13252 assert(Bundle->isSchedulingEntity() &&
13254 "tried to unbundle something which is not a bundle");
13257 if (Bundle->isReady())
13258 ReadyInsts.remove(Bundle);
13261 ScheduleData *BundleMember = Bundle;
13262 while (BundleMember) {
13263 assert(BundleMember->FirstInBundle == Bundle &&
"corrupt bundle links");
13264 BundleMember->FirstInBundle = BundleMember;
13265 ScheduleData *Next = BundleMember->NextInBundle;
13266 BundleMember->NextInBundle =
nullptr;
13267 BundleMember->TE =
nullptr;
13268 if (BundleMember->unscheduledDepsInBundle() == 0) {
13269 ReadyInsts.insert(BundleMember);
13271 BundleMember = Next;
13275BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
13277 if (ChunkPos >= ChunkSize) {
13278 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
13281 return &(ScheduleDataChunks.back()[ChunkPos++]);
13284bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V,
13285 const InstructionsState &S) {
13286 if (getScheduleData(V,
isOneOf(S, V)))
13289 assert(
I &&
"bundle member must be an instruction");
13292 "phi nodes/insertelements/extractelements/extractvalues don't need to "
13294 auto &&CheckScheduleForI = [
this, &S](
Instruction *
I) ->
bool {
13295 ScheduleData *ISD = getScheduleData(
I);
13298 assert(isInSchedulingRegion(ISD) &&
13299 "ScheduleData not in scheduling region");
13300 ScheduleData *SD = allocateScheduleDataChunks();
13302 SD->init(SchedulingRegionID, S.OpValue);
13303 ExtraScheduleDataMap[
I][S.OpValue] = SD;
13306 if (CheckScheduleForI(
I))
13308 if (!ScheduleStart) {
13310 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
13312 ScheduleEnd =
I->getNextNode();
13314 CheckScheduleForI(
I);
13315 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
13316 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
13324 ++ScheduleStart->getIterator().getReverse();
13329 if (
auto *II = dyn_cast<IntrinsicInst>(&
I))
13330 return II->isAssumeLikeIntrinsic();
13333 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
13334 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
13335 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
13337 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
13338 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
13345 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
13346 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
13348 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
13349 assert(
I->getParent() == ScheduleStart->getParent() &&
13350 "Instruction is in wrong basic block.");
13351 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
13354 CheckScheduleForI(
I);
13359 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
13360 "Expected to reach top of the basic block or instruction down the "
13362 assert(
I->getParent() == ScheduleEnd->getParent() &&
13363 "Instruction is in wrong basic block.");
13364 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
13366 ScheduleEnd =
I->getNextNode();
13368 CheckScheduleForI(
I);
13369 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
13370 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
13374void BoUpSLP::BlockScheduling::initScheduleData(
Instruction *FromI,
13376 ScheduleData *PrevLoadStore,
13377 ScheduleData *NextLoadStore) {
13378 ScheduleData *CurrentLoadStore = PrevLoadStore;
13383 ScheduleData *SD = ScheduleDataMap.lookup(
I);
13385 SD = allocateScheduleDataChunks();
13386 ScheduleDataMap[
I] = SD;
13389 assert(!isInSchedulingRegion(SD) &&
13390 "new ScheduleData already in scheduling region");
13391 SD->init(SchedulingRegionID,
I);
13393 if (
I->mayReadOrWriteMemory() &&
13394 (!isa<IntrinsicInst>(
I) ||
13395 (cast<IntrinsicInst>(
I)->getIntrinsicID() != Intrinsic::sideeffect &&
13396 cast<IntrinsicInst>(
I)->getIntrinsicID() !=
13397 Intrinsic::pseudoprobe))) {
13399 if (CurrentLoadStore) {
13400 CurrentLoadStore->NextLoadStore = SD;
13402 FirstLoadStoreInRegion = SD;
13404 CurrentLoadStore = SD;
13407 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
13408 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
13409 RegionHasStackSave =
true;
13411 if (NextLoadStore) {
13412 if (CurrentLoadStore)
13413 CurrentLoadStore->NextLoadStore = NextLoadStore;
13415 LastLoadStoreInRegion = CurrentLoadStore;
13419void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
13420 bool InsertInReadyList,
13422 assert(SD->isSchedulingEntity());
13427 while (!WorkList.
empty()) {
13429 for (ScheduleData *BundleMember = SD; BundleMember;
13430 BundleMember = BundleMember->NextInBundle) {
13431 assert(isInSchedulingRegion(BundleMember));
13432 if (BundleMember->hasValidDependencies())
13437 BundleMember->Dependencies = 0;
13438 BundleMember->resetUnscheduledDeps();
13441 if (BundleMember->OpValue != BundleMember->Inst) {
13442 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
13443 BundleMember->Dependencies++;
13444 ScheduleData *DestBundle = UseSD->FirstInBundle;
13445 if (!DestBundle->IsScheduled)
13446 BundleMember->incrementUnscheduledDeps(1);
13447 if (!DestBundle->hasValidDependencies())
13451 for (
User *U : BundleMember->Inst->
users()) {
13452 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
13453 BundleMember->Dependencies++;
13454 ScheduleData *DestBundle = UseSD->FirstInBundle;
13455 if (!DestBundle->IsScheduled)
13456 BundleMember->incrementUnscheduledDeps(1);
13457 if (!DestBundle->hasValidDependencies())
13464 auto *DepDest = getScheduleData(
I);
13465 assert(DepDest &&
"must be in schedule window");
13466 DepDest->ControlDependencies.push_back(BundleMember);
13467 BundleMember->Dependencies++;
13468 ScheduleData *DestBundle = DepDest->FirstInBundle;
13469 if (!DestBundle->IsScheduled)
13470 BundleMember->incrementUnscheduledDeps(1);
13471 if (!DestBundle->hasValidDependencies())
13479 for (
Instruction *
I = BundleMember->Inst->getNextNode();
13480 I != ScheduleEnd;
I =
I->getNextNode()) {
13485 MakeControlDependent(
I);
13493 if (RegionHasStackSave) {
13497 if (
match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
13498 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
13499 for (
Instruction *
I = BundleMember->Inst->getNextNode();
13500 I != ScheduleEnd;
I =
I->getNextNode()) {
13501 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
13502 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
13507 if (!isa<AllocaInst>(
I))
13511 MakeControlDependent(
I);
13520 if (isa<AllocaInst>(BundleMember->Inst) ||
13521 BundleMember->Inst->mayReadOrWriteMemory()) {
13522 for (
Instruction *
I = BundleMember->Inst->getNextNode();
13523 I != ScheduleEnd;
I =
I->getNextNode()) {
13524 if (!
match(
I, m_Intrinsic<Intrinsic::stacksave>()) &&
13525 !
match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
13529 MakeControlDependent(
I);
13536 ScheduleData *DepDest = BundleMember->NextLoadStore;
13541 "NextLoadStore list for non memory effecting bundle?");
13543 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
13544 unsigned NumAliased = 0;
13545 unsigned DistToSrc = 1;
13547 for (; DepDest; DepDest = DepDest->NextLoadStore) {
13548 assert(isInSchedulingRegion(DepDest));
13558 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
13560 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
13567 DepDest->MemoryDependencies.push_back(BundleMember);
13568 BundleMember->Dependencies++;
13569 ScheduleData *DestBundle = DepDest->FirstInBundle;
13570 if (!DestBundle->IsScheduled) {
13571 BundleMember->incrementUnscheduledDeps(1);
13573 if (!DestBundle->hasValidDependencies()) {
13596 if (InsertInReadyList && SD->isReady()) {
13597 ReadyInsts.insert(SD);
13604void BoUpSLP::BlockScheduling::resetSchedule() {
13606 "tried to reset schedule on block which has not been scheduled");
13607 for (
Instruction *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
13608 doForAllOpcodes(
I, [&](ScheduleData *SD) {
13609 assert(isInSchedulingRegion(SD) &&
13610 "ScheduleData not in scheduling region");
13611 SD->IsScheduled =
false;
13612 SD->resetUnscheduledDeps();
13615 ReadyInsts.clear();
13618void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
13619 if (!BS->ScheduleStart)
13622 LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
13629 BS->resetSchedule();
13636 struct ScheduleDataCompare {
13637 bool operator()(ScheduleData *SD1, ScheduleData *SD2)
const {
13638 return SD2->SchedulingPriority < SD1->SchedulingPriority;
13641 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
13646 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
13647 I =
I->getNextNode()) {
13648 BS->doForAllOpcodes(
I, [
this, &
Idx, BS](ScheduleData *SD) {
13649 TreeEntry *SDTE = getTreeEntry(SD->Inst);
13652 SD->isPartOfBundle() ==
13654 "scheduler and vectorizer bundle mismatch");
13655 SD->FirstInBundle->SchedulingPriority =
Idx++;
13657 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
13658 BS->calculateDependencies(SD,
false,
this);
13661 BS->initialFillReadyList(ReadyInsts);
13663 Instruction *LastScheduledInst = BS->ScheduleEnd;
13666 while (!ReadyInsts.empty()) {
13667 ScheduleData *Picked = *ReadyInsts.begin();
13668 ReadyInsts.erase(ReadyInsts.begin());
13672 for (ScheduleData *BundleMember = Picked; BundleMember;
13673 BundleMember = BundleMember->NextInBundle) {
13677 LastScheduledInst = PickedInst;
13680 BS->schedule(Picked, ReadyInsts);
13684#ifdef EXPENSIVE_CHECKS
13688#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
13690 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
I =
I->getNextNode()) {
13691 BS->doForAllOpcodes(
I, [&](ScheduleData *SD) {
13692 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
13693 assert(SD->IsScheduled &&
"must be scheduled at this point");
13700 BS->ScheduleStart =
nullptr;
13707 if (
auto *Store = dyn_cast<StoreInst>(V))
13708 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
13710 if (
auto *IEI = dyn_cast<InsertElementInst>(V))
13713 auto E = InstrElementSize.
find(V);
13714 if (
E != InstrElementSize.
end())
13723 if (
auto *
I = dyn_cast<Instruction>(V)) {
13731 while (!Worklist.
empty()) {
13738 auto *Ty =
I->getType();
13739 if (isa<VectorType>(Ty))
13744 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(
I))
13745 Width = std::max<unsigned>(Width,
DL->getTypeSizeInBits(Ty));
13753 for (
Use &U :
I->operands())
13754 if (
auto *J = dyn_cast<Instruction>(U.get()))
13755 if (Visited.
insert(J).second &&
13756 (isa<PHINode>(
I) || J->getParent() == Parent))
13767 if (
auto *CI = dyn_cast<CmpInst>(V))
13769 Width =
DL->getTypeSizeInBits(V->getType());
13773 InstrElementSize[
I] = Width;
13781bool BoUpSLP::collectValuesToDemote(
13786 if (isa<Constant>(V))
13792 auto *
I = dyn_cast<Instruction>(V);
13793 if (!
I || !getTreeEntry(
I) || MultiNodeScalars.
contains(
I) ||
13795 return isa<InsertElementInst>(U) && !getTreeEntry(U);
13799 unsigned Start = 0;
13800 unsigned End =
I->getNumOperands();
13801 switch (
I->getOpcode()) {
13805 case Instruction::Trunc:
13808 case Instruction::ZExt:
13809 case Instruction::SExt:
13810 if (isa<ExtractElementInst, InsertElementInst>(
I->getOperand(0)))
13816 case Instruction::Add:
13817 case Instruction::Sub:
13818 case Instruction::Mul:
13819 case Instruction::And:
13820 case Instruction::Or:
13821 case Instruction::Xor:
13822 if (!collectValuesToDemote(
I->getOperand(0), ToDemote, DemotedConsts, Roots,
13824 !collectValuesToDemote(
I->getOperand(1), ToDemote, DemotedConsts, Roots,
13830 case Instruction::Select: {
13833 if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
13835 !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
13843 case Instruction::PHI: {
13846 if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
13858 for (
unsigned Idx : seq<unsigned>(Start,
End))
13859 if (isa<Constant>(
I->getOperand(
Idx)))
13860 DemotedConsts.try_emplace(
I).first->getSecond().push_back(
Idx);
13868 auto &TreeRoot = VectorizableTree[0]->Scalars;
13869 auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->
getType());
13870 if (!TreeRootIT || VectorizableTree.front()->State == TreeEntry::NeedToGather)
13874 if (!VectorizableTree.front()->UserTreeIndices.empty())
13883 for (
auto *Root : TreeRoot) {
13885 if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
13892 auto MaxBitWidth = 1u;
13896 for (
auto *Root : TreeRoot) {
13898 MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
13905 bool IsKnownPositive =
true;
13916 if (MaxBitWidth ==
DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
13918 return all_of(V->users(),
13919 [](
User *U) { return isa<GetElementPtrInst>(U); });
13932 for (
auto *Scalar : ToDemote) {
13934 auto NumTypeBits =
DL->getTypeSizeInBits(Scalar->getType());
13935 MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
13953 if (!IsKnownPositive)
13962 if (MaxBitWidth >= TreeRootIT->getBitWidth())
13968 while (!Roots.
empty()) {
13970 collectValuesToDemote(Roots.
pop_back_val(), ToDemote, DemotedConsts, Roots,
13977 for (
Value *V: ToDemote) {
13978 const TreeEntry *TE = getTreeEntry(V);
13979 assert(TE &&
"Expected vectorized scalar.");
13980 if (!Visited.
insert(TE).second)
13983 return all_of(EI.UserTE->Scalars,
13984 [&](Value *V) { return Demoted.contains(V); });
13989 for (
auto *Scalar : ToDemote) {
13990 auto *TE = getTreeEntry(Scalar);
13991 assert(TE &&
"Expected vectorized scalar.");
13992 if (MinBWs.contains(TE))
13994 bool IsSigned =
any_of(TE->Scalars, [&](
Value *R) {
13995 KnownBits Known = computeKnownBits(R, *DL);
13996 return !Known.isNonNegative();
13998 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
13999 const auto *
I = cast<Instruction>(Scalar);
14000 auto DCIt = DemotedConsts.find(
I);
14001 if (DCIt != DemotedConsts.end()) {
14002 for (
unsigned Idx : DCIt->getSecond()) {
14005 auto SIt = DemotedConsts.find(cast<Instruction>(V));
14006 return SIt != DemotedConsts.end() &&
14007 is_contained(SIt->getSecond(), Idx);
14009 const TreeEntry *CTE = getOperandEntry(TE,
Idx);
14010 MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
14028 bool Changed =
runImpl(
F, SE,
TTI, TLI, AA, LI, DT, AC, DB, ORE);
14053 DL = &
F.getParent()->getDataLayout();
14057 bool Changed =
false;
14063 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
14068 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
14071 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
14075 BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
14081 DT->updateDFSNumbers();
14084 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
14086 R.clearReductionData();
14087 collectSeedInstructions(BB);
14090 if (!Stores.empty()) {
14092 <<
" underlying objects.\n");
14093 Changed |= vectorizeStoreChains(R);
14097 Changed |= vectorizeChainsInBlock(BB, R);
14102 if (!GEPs.
empty()) {
14104 <<
" underlying objects.\n");
14105 Changed |= vectorizeGEPIndices(BB, R);
14110 R.optimizeGatherSequence();
14117 unsigned Idx,
unsigned MinVF) {
14120 const unsigned Sz = R.getVectorElementSize(Chain[0]);
14121 unsigned VF = Chain.
size();
14129 R.buildTree(Chain);
14130 if (R.isTreeTinyAndNotFullyVectorizable())
14132 if (R.isLoadCombineCandidate())
14134 R.reorderTopToBottom();
14135 R.reorderBottomToTop();
14136 R.buildExternalUses();
14138 R.computeMinimumValueSizes();
14146 using namespace ore;
14149 cast<StoreInst>(Chain[0]))
14150 <<
"Stores SLP vectorized with cost " << NV(
"Cost",
Cost)
14151 <<
" and with tree size "
14152 << NV(
"TreeSize", R.getTreeSize()));
14166 bool Changed =
false;
14172 struct StoreDistCompare {
14173 bool operator()(
const std::pair<unsigned, int> &Op1,
14174 const std::pair<unsigned, int> &Op2)
const {
14175 return Op1.second < Op2.second;
14180 using StoreIndexToDistSet =
14181 std::set<std::pair<unsigned, int>, StoreDistCompare>;
14182 auto TryToVectorize = [&](
const StoreIndexToDistSet &Set) {
14187 if (
Operands.empty() ||
Data.second - PrevDist == 1) {
14189 PrevDist =
Data.second;
14190 if (
Idx != Set.size() - 1)
14195 Operands.push_back(Stores[DataVar.first]);
14196 PrevDist = DataVar.second;
14202 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
14203 unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
14207 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
14209 Type *StoreTy =
Store->getValueOperand()->getType();
14210 Type *ValueTy = StoreTy;
14211 if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
14212 ValueTy = Trunc->getSrcTy();
14214 R.getMinVF(
DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
14216 if (MaxVF < MinVF) {
14217 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
14219 <<
"MinVF (" << MinVF <<
")\n");
14227 unsigned Size = MaxVF;
14228 for_each(CandidateVFs, [&](
unsigned &VF) {
14232 unsigned StartIdx = 0;
14233 for (
unsigned Size : CandidateVFs) {
14234 for (
unsigned Cnt = StartIdx,
E =
Operands.size(); Cnt +
Size <=
E;) {
14240 return cast<StoreInst>(V)->getValueOperand()->getType() ==
14241 cast<StoreInst>(Slice.
front())
14242 ->getValueOperand()
14245 "Expected all operands of same type.");
14246 if (!VectorizedStores.
count(Slice.
front()) &&
14247 !VectorizedStores.
count(Slice.
back()) &&
14250 vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
14256 if (Cnt == StartIdx)
14312 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
14314 Stores[Set.first]->getValueOperand()->getType(),
14315 Stores[Set.first]->getPointerOperand(),
14316 SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
14320 auto It = Set.second.find(std::make_pair(
Idx, *Diff));
14321 if (It == Set.second.end()) {
14322 Set.second.emplace(
Idx, *Diff);
14326 TryToVectorize(Set.second);
14327 StoreIndexToDistSet PrevSet;
14328 PrevSet.swap(Set.second);
14330 Set.second.emplace(
Idx, 0);
14333 unsigned StartIdx = It->first + 1;
14338 for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
14340 if (Pair.first <= It->first ||
14341 VectorizedStores.
contains(Stores[Pair.first]))
14343 unsigned BI = Pair.first - StartIdx;
14344 UsedStores.set(BI);
14345 Dists[BI] = Pair.second - It->second;
14347 for (
unsigned I = StartIdx;
I <
Idx; ++
I) {
14348 unsigned BI =
I - StartIdx;
14349 if (UsedStores.test(BI))
14350 Set.second.emplace(
I, Dists[BI]);
14354 auto &Res = SortedStores.emplace_back();
14356 Res.second.emplace(
Idx, 0);
14362 SI->getValueOperand()->getType()) {
14363 for (
auto &Set : SortedStores)
14364 TryToVectorize(Set.second);
14365 SortedStores.clear();
14368 FillStoresSet(
I, SI);
14372 for (
auto &Set : SortedStores)
14373 TryToVectorize(Set.second);
14378void SLPVectorizerPass::collectSeedInstructions(
BasicBlock *BB) {
14389 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
14390 if (!
SI->isSimple())
14400 else if (
auto *
GEP = dyn_cast<GetElementPtrInst>(&
I)) {
14401 if (
GEP->getNumIndices() != 1)
14404 if (isa<Constant>(
Idx))
14408 if (
GEP->getType()->isVectorTy())
14420 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
14421 << VL.
size() <<
".\n");
14426 if (!S.getOpcode())
14432 for (
Value *V : VL) {
14433 Type *Ty =
V->getType();
14437 R.getORE()->emit([&]() {
14438 std::string TypeStr;
14442 <<
"Cannot SLP vectorize list: type "
14443 << rso.str() +
" is unsupported by vectorizer";
14449 unsigned Sz =
R.getVectorElementSize(I0);
14450 unsigned MinVF =
R.getMinVF(Sz);
14451 unsigned MaxVF = std::max<unsigned>(
llvm::bit_floor(VL.size()), MinVF);
14452 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
14454 R.getORE()->emit([&]() {
14456 <<
"Cannot SLP vectorize list: vectorization factor "
14457 <<
"less than 2 is not supported";
14462 bool Changed =
false;
14463 bool CandidateFound =
false;
14465 Type *ScalarTy = VL[0]->getType();
14466 if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
14467 ScalarTy =
IE->getOperand(1)->getType();
14469 unsigned NextInst = 0, MaxInst = VL.size();
14470 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
14477 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
14478 unsigned ActualVF = std::min(MaxInst -
I, VF);
14483 if (MaxVFOnly && ActualVF < MaxVF)
14485 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
14491 auto *
I = dyn_cast<Instruction>(V);
14492 return I &&
R.isDeleted(
I);
14496 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
14500 if (
R.isTreeTinyAndNotFullyVectorizable())
14502 R.reorderTopToBottom();
14503 R.reorderBottomToTop(
14504 !isa<InsertElementInst>(Ops.
front()) &&
14505 !
R.doesRootHaveInTreeUses());
14506 R.buildExternalUses();
14508 R.computeMinimumValueSizes();
14510 CandidateFound =
true;
14511 MinCost = std::min(MinCost,
Cost);
14514 <<
" for VF=" << ActualVF <<
"\n");
14518 cast<Instruction>(Ops[0]))
14519 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
14520 <<
" and with tree size "
14521 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
14532 if (!Changed && CandidateFound) {
14533 R.getORE()->emit([&]() {
14535 <<
"List vectorization was possible but not beneficial with cost "
14536 <<
ore::NV(
"Cost", MinCost) <<
" >= "
14539 }
else if (!Changed) {
14540 R.getORE()->emit([&]() {
14542 <<
"Cannot SLP vectorize list: vectorization was impossible"
14543 <<
" with available vectorization factors";
14553 if (!isa<BinaryOperator, CmpInst>(
I) || isa<VectorType>(
I->getType()))
14559 auto *Op0 = dyn_cast<Instruction>(
I->getOperand(0));
14560 auto *Op1 = dyn_cast<Instruction>(
I->getOperand(1));
14561 if (!Op0 || !Op1 || Op0->getParent() !=
P || Op1->getParent() !=
P)
14568 auto *
A = dyn_cast<BinaryOperator>(Op0);
14569 auto *
B = dyn_cast<BinaryOperator>(Op1);
14571 if (
A &&
B &&
B->hasOneUse()) {
14572 auto *B0 = dyn_cast<BinaryOperator>(
B->getOperand(0));
14573 auto *B1 = dyn_cast<BinaryOperator>(
B->getOperand(1));
14574 if (B0 && B0->getParent() ==
P)
14576 if (B1 && B1->getParent() ==
P)
14580 if (
B &&
A &&
A->hasOneUse()) {
14581 auto *A0 = dyn_cast<BinaryOperator>(
A->getOperand(0));
14582 auto *A1 = dyn_cast<BinaryOperator>(
A->getOperand(1));
14583 if (A0 && A0->getParent() ==
P)
14585 if (A1 && A1->getParent() ==
P)
14589 if (Candidates.
size() == 1)
14590 return tryToVectorizeList({Op0, Op1},
R);
14593 std::optional<int> BestCandidate =
R.findBestRootPair(Candidates);
14594 if (!BestCandidate)
14596 return tryToVectorizeList(
14597 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
R);
14631 ReductionOpsListType ReductionOps;
14643 bool IsSupportedHorRdxIdentityOp =
false;
14654 return isa<SelectInst>(
I) &&
14660 if (Kind == RecurKind::None)
14668 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
14672 return I->getFastMathFlags().noNaNs();
14675 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
14678 return I->isAssociative();
14686 if (getRdxKind(
I) == RecurKind::Or && isa<SelectInst>(
I) &&
Index == 1)
14687 return I->getOperand(2);
14688 return I->getOperand(
Index);
14697 case RecurKind::Or:
14703 case RecurKind::And:
14709 case RecurKind::Add:
14710 case RecurKind::Mul:
14711 case RecurKind::Xor:
14712 case RecurKind::FAdd:
14713 case RecurKind::FMul:
14716 case RecurKind::FMax:
14719 maxnum(cast<ConstantFP>(
LHS)->getValueAPF(),
14720 cast<ConstantFP>(
RHS)->getValueAPF()));
14722 case RecurKind::FMin:
14725 minnum(cast<ConstantFP>(
LHS)->getValueAPF(),
14726 cast<ConstantFP>(
RHS)->getValueAPF()));
14728 case RecurKind::FMaximum:
14731 maximum(cast<ConstantFP>(
LHS)->getValueAPF(),
14732 cast<ConstantFP>(
RHS)->getValueAPF()));
14734 case RecurKind::FMinimum:
14737 minimum(cast<ConstantFP>(
LHS)->getValueAPF(),
14738 cast<ConstantFP>(
RHS)->getValueAPF()));
14740 case RecurKind::SMax:
14741 if (IsConstant || UseSelect) {
14746 case RecurKind::SMin:
14747 if (IsConstant || UseSelect) {
14752 case RecurKind::UMax:
14753 if (IsConstant || UseSelect) {
14758 case RecurKind::UMin:
14759 if (IsConstant || UseSelect) {
14773 const ReductionOpsListType &ReductionOps) {
14775 ReductionOps.size() == 2 ||
14777 (ReductionOps.size() == 1 &&
any_of(ReductionOps.front(), [](
Value *V) {
14778 return isa<SelectInst>(V);
14780 assert((!UseSelect || ReductionOps.size() != 2 ||
14781 isa<SelectInst>(ReductionOps[1][0])) &&
14782 "Expected cmp + select pairs for reduction");
14785 if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
14799 auto *
I = dyn_cast<Instruction>(V);
14801 return RecurKind::None;
14803 return RecurKind::Add;
14805 return RecurKind::Mul;
14808 return RecurKind::And;
14811 return RecurKind::Or;
14813 return RecurKind::Xor;
14815 return RecurKind::FAdd;
14817 return RecurKind::FMul;
14820 return RecurKind::FMax;
14822 return RecurKind::FMin;
14825 return RecurKind::FMaximum;
14827 return RecurKind::FMinimum;
14833 return RecurKind::SMax;
14835 return RecurKind::SMin;
14837 return RecurKind::UMax;
14839 return RecurKind::UMin;
14841 if (
auto *
Select = dyn_cast<SelectInst>(
I)) {
14863 if (!isa<ExtractElementInst>(
RHS) ||
14865 return RecurKind::None;
14867 if (!isa<ExtractElementInst>(
LHS) ||
14869 return RecurKind::None;
14871 if (!isa<ExtractElementInst>(
LHS) || !isa<ExtractElementInst>(
RHS))
14872 return RecurKind::None;
14876 return RecurKind::None;
14881 return RecurKind::None;
14884 return RecurKind::SMax;
14887 return RecurKind::SMin;
14890 return RecurKind::UMax;
14893 return RecurKind::UMin;
14896 return RecurKind::None;
14900 static unsigned getFirstOperandIndex(
Instruction *
I) {
14901 return isCmpSelMinMax(
I) ? 1 : 0;
14907 return isCmpSelMinMax(
I) ? 3 : 2;
14913 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
14914 auto *Sel = cast<SelectInst>(
I);
14915 auto *
Cmp = dyn_cast<Instruction>(Sel->getCondition());
14916 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
14918 return I->getParent() == BB;
14922 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax,
Instruction *
I) {
14923 if (IsCmpSelMinMax) {
14926 if (
auto *Sel = dyn_cast<SelectInst>(
I))
14927 return Sel->
hasNUses(2) && Sel->getCondition()->hasOneUse();
14928 return I->hasNUses(2);
14932 return I->hasOneUse();
14937 if (isCmpSelMinMax(
I))
14938 ReductionOps.assign(2, ReductionOpsType());
14940 ReductionOps.assign(1, ReductionOpsType());
14945 if (isCmpSelMinMax(
I)) {
14946 ReductionOps[0].emplace_back(cast<SelectInst>(
I)->getCondition());
14947 ReductionOps[1].emplace_back(
I);
14949 ReductionOps[0].emplace_back(
I);
14954 int Sz = Data.size();
14955 auto *
I = dyn_cast<Instruction>(Data.front());
14956 return Sz > 1 ||
isConstant(Data.front()) ||
14967 RdxKind = HorizontalReduction::getRdxKind(Root);
14968 if (!isVectorizable(RdxKind, Root))
14979 if (
auto *Sel = dyn_cast<SelectInst>(Root))
14980 if (!Sel->getCondition()->hasOneUse())
14983 ReductionRoot = Root;
14988 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
14997 for (
int I = getFirstOperandIndex(TreeN),
14998 End = getNumberOfOperands(TreeN);
15000 Value *EdgeVal = getRdxOperand(TreeN,
I);
15001 ReducedValsToOps[EdgeVal].push_back(TreeN);
15002 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
15005 !hasSameParent(EdgeInst, BB)) {
15006 ExtraArgs.push_back(EdgeVal);
15013 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
15014 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
15015 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
15016 !isVectorizable(RdxKind, EdgeInst) ||
15017 (
R.isAnalyzedReductionRoot(EdgeInst) &&
15019 PossibleReducedVals.push_back(EdgeVal);
15022 ReductionOps.push_back(EdgeInst);
15031 PossibleReducedVals;
15032 initReductionOps(Root);
15037 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
15040 auto LIt = LoadsMap.
find(
Ptr);
15041 if (LIt != LoadsMap.
end()) {
15042 for (
LoadInst *RLI : LIt->second) {
15048 for (
LoadInst *RLI : LIt->second) {
15052 DoNotReverseVals.
insert(RLI);
15056 if (LIt->second.size() > 2) {
15058 hash_value(LIt->second.back()->getPointerOperand());
15059 DoNotReverseVals.
insert(LIt->second.back());
15064 LoadKeyUsed.
insert(Key);
15069 while (!Worklist.empty()) {
15074 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
15077 if (
Args.size() < 2) {
15078 addReductionOps(TreeN);
15080 if (!
Args.empty()) {
15081 assert(
Args.size() == 1 &&
"Expected only single argument.");
15082 ExtraArgs[TreeN] =
Args.front();
15086 for (
Value *V : PossibleRedVals) {
15090 ++PossibleReducedVals[
Key][
Idx]
15091 .
insert(std::make_pair(V, 0))
15094 Worklist.append(PossibleReductionOps.
rbegin(),
15095 PossibleReductionOps.
rend());
15100 ++PossibleReducedVals[
Key][
Idx]
15101 .
insert(std::make_pair(TreeN, 0))
15105 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
15108 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
15109 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
15111 for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
15114 auto RedValsVect = It->second.takeVector();
15116 for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
15117 PossibleRedValsVect.
back().append(Data.second, Data.first);
15119 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
15120 return P1.size() > P2.size();
15124 if (isGoodForReduction(Data) ||
15125 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
15126 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
15128 cast<LoadInst>(Data.front())->getPointerOperand()) ==
15132 NewIdx = ReducedVals.
size();
15135 if (DoNotReverseVals.
contains(Data.front()))
15136 ReducedVals[NewIdx].
append(Data.begin(), Data.end());
15138 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
15140 ReducedVals.
emplace_back().append(Data.rbegin(), Data.rend());
15155 constexpr int ReductionLimit = 4;
15156 constexpr unsigned RegMaxNumber = 4;
15157 constexpr unsigned RedValsMaxNumber = 128;
15161 unsigned NumReducedVals =
15162 std::accumulate(ReducedVals.
begin(), ReducedVals.
end(), 0,
15164 if (!isGoodForReduction(Vals))
15166 return Num + Vals.size();
15168 if (NumReducedVals < ReductionLimit &&
15173 for (ReductionOpsType &RdxOps : ReductionOps)
15174 for (
Value *RdxOp : RdxOps)
15175 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
15179 IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
15184 ReducedVals.
size() * ReducedVals.
front().size() + ExtraArgs.size());
15187 ExternallyUsedValues.
reserve(ExtraArgs.size() + 1);
15190 for (
const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
15191 assert(Pair.first &&
"DebugLoc must be set.");
15192 ExternallyUsedValues[Pair.second].push_back(Pair.first);
15193 TrackedVals.
try_emplace(Pair.second, Pair.second);
15198 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
15199 assert(isa<SelectInst>(RdxRootInst) &&
15200 "Expected min/max reduction to have select root instruction");
15201 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
15202 assert(isa<Instruction>(ScalarCond) &&
15203 "Expected min/max reduction to have compare condition");
15204 return cast<Instruction>(ScalarCond);
15208 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
15209 if (VectorizedTree) {
15212 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
15213 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
15216 auto It = ReducedValsToOps.
find(Res);
15217 if (It != ReducedValsToOps.
end() &&
15223 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
15229 bool AnyBoolLogicOp =
15231 return isBoolLogicOp(cast<Instruction>(V));
15235 ExternallyUsedValues[ReductionRoot];
15237 ReductionOps.front().size());
15238 for (ReductionOpsType &RdxOps : ReductionOps)
15239 for (
Value *RdxOp : RdxOps) {
15242 IgnoreList.insert(RdxOp);
15247 for (
Value *U : IgnoreList)
15248 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
15249 RdxFMF &= FPMO->getFastMathFlags();
15250 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
15255 for (
Value *V : Candidates)
15256 TrackedVals.try_emplace(V, V);
15262 Value *VectorizedTree =
nullptr;
15263 bool CheckForReusedReductionOps =
false;
15265 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
15271 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
15272 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
15277 auto *Inst = dyn_cast<Instruction>(RdxVal);
15279 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
15280 (S.getOpcode() && !Inst))
15283 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
15285 bool ShuffledExtracts =
false;
15287 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
15289 InstructionsState NextS =
getSameOpcode(ReducedVals[
I + 1], TLI);
15290 if (NextS.getOpcode() == Instruction::ExtractElement &&
15291 !NextS.isAltShuffle()) {
15293 for (
Value *RV : ReducedVals[
I + 1]) {
15294 Value *RdxVal = TrackedVals.find(RV)->second;
15298 if (
auto *Inst = dyn_cast<Instruction>(RdxVal))
15299 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
15301 CommonCandidates.push_back(RdxVal);
15302 TrackedToOrig.try_emplace(RdxVal, RV);
15307 Candidates.
swap(CommonCandidates);
15308 ShuffledExtracts =
true;
15317 ++VectorizedVals.try_emplace(Candidates.
front(), 0).first->getSecond();
15319 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
15320 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
15321 if (
auto *ResI = dyn_cast<Instruction>(Res))
15322 V.analyzedReductionRoot(ResI);
15324 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
15328 unsigned NumReducedVals = Candidates.
size();
15329 if (NumReducedVals < ReductionLimit &&
15336 IsSupportedHorRdxIdentityOp =
15338 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
15341 if (IsSupportedHorRdxIdentityOp)
15342 for (
Value *V : Candidates)
15343 ++SameValuesCounter.
insert(std::make_pair(V, 0)).first->second;
15354 bool SameScaleFactor =
false;
15355 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
15356 SameValuesCounter.
size() != Candidates.size();
15357 if (OptReusedScalars) {
15359 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
15360 RdxKind == RecurKind::Xor) &&
15362 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
15363 return P.second == SameValuesCounter.
front().second;
15365 Candidates.resize(SameValuesCounter.
size());
15366 transform(SameValuesCounter, Candidates.begin(),
15367 [](
const auto &
P) { return P.first; });
15368 NumReducedVals = Candidates.size();
15370 if (NumReducedVals == 1) {
15371 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
15372 unsigned Cnt = SameValuesCounter.
lookup(OrigV);
15374 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
15375 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
15376 VectorizedVals.try_emplace(OrigV, Cnt);
15381 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
15382 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
15386 unsigned ReduxWidth = std::min<unsigned>(
15387 llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
15388 unsigned Start = 0;
15389 unsigned Pos = Start;
15391 unsigned PrevReduxWidth = ReduxWidth;
15392 bool CheckForReusedReductionOpsLocal =
false;
15393 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
15394 &CheckForReusedReductionOpsLocal,
15395 &PrevReduxWidth, &
V,
15396 &IgnoreList](
bool IgnoreVL =
false) {
15397 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
15398 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
15401 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
15404 if (Pos < NumReducedVals - ReduxWidth + 1)
15405 return IsAnyRedOpGathered;
15408 return IsAnyRedOpGathered;
15410 bool AnyVectorized =
false;
15411 while (Pos < NumReducedVals - ReduxWidth + 1 &&
15412 ReduxWidth >= ReductionLimit) {
15415 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
15417 CheckForReusedReductionOps =
true;
15420 PrevReduxWidth = ReduxWidth;
15423 if (
V.areAnalyzedReductionVals(VL)) {
15424 (void)AdjustReducedVals(
true);
15430 auto *RedValI = dyn_cast<Instruction>(RedVal);
15433 return V.isDeleted(RedValI);
15436 V.buildTree(VL, IgnoreList);
15437 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
15438 if (!AdjustReducedVals())
15439 V.analyzedReductionVals(VL);
15442 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
15443 if (!AdjustReducedVals())
15444 V.analyzedReductionVals(VL);
15447 V.reorderTopToBottom();
15449 V.reorderBottomToTop(
true);
15453 ExternallyUsedValues);
15454 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
15455 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
15457 for (
Value *V : ReducedVals[Cnt])
15458 if (isa<Instruction>(V))
15459 LocalExternallyUsedValues[TrackedVals[
V]];
15461 if (!IsSupportedHorRdxIdentityOp) {
15464 "Reused values counter map is not empty");
15465 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
15466 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
15468 Value *
V = Candidates[Cnt];
15469 Value *OrigV = TrackedToOrig.find(V)->second;
15470 ++SameValuesCounter[OrigV];
15476 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
15477 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
15479 Value *RdxVal = Candidates[Cnt];
15480 if (!Visited.
insert(RdxVal).second)
15484 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
15485 LocalExternallyUsedValues[RdxVal];
15488 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
15490 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
15491 if (NumOps != ReducedValsToOps.
find(OrigV)->second.size())
15492 LocalExternallyUsedValues[RdxVal];
15495 if (!IsSupportedHorRdxIdentityOp)
15496 SameValuesCounter.
clear();
15497 for (
Value *RdxVal : VL)
15498 if (RequiredExtract.
contains(RdxVal))
15499 LocalExternallyUsedValues[RdxVal];
15503 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals)
15504 ReplacementToExternal.
try_emplace(Pair.second, Pair.first);
15505 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
15507 auto RIt = ReplacementToExternal.
find(Ext);
15508 while (RIt != ReplacementToExternal.
end()) {
15510 RIt = ReplacementToExternal.
find(Ext);
15512 auto *It = ExternallyUsedValues.
find(Ext);
15513 if (It == ExternallyUsedValues.
end())
15515 LocalExternallyUsedValues[Pair.second].append(It->second);
15517 V.buildExternalUses(LocalExternallyUsedValues);
15519 V.computeMinimumValueSizes();
15524 getReductionCost(
TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
15527 <<
" for reduction\n");
15531 V.getORE()->emit([&]() {
15533 SV_NAME,
"HorSLPNotBeneficial",
15534 ReducedValsToOps.
find(VL[0])->second.front())
15535 <<
"Vectorizing horizontal reduction is possible "
15536 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
15537 <<
" and threshold "
15540 if (!AdjustReducedVals())
15541 V.analyzedReductionVals(VL);
15545 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
15546 <<
Cost <<
". (HorRdx)\n");
15547 V.getORE()->emit([&]() {
15549 SV_NAME,
"VectorizedHorizontalReduction",
15550 ReducedValsToOps.
find(VL[0])->second.front())
15551 <<
"Vectorized horizontal reduction with cost "
15552 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
15553 <<
ore::NV(
"TreeSize",
V.getTreeSize());
15560 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
15562 if (IsCmpSelMinMax)
15563 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
15566 Value *VectorizedRoot =
V.vectorizeTree(LocalExternallyUsedValues,
15567 ReplacedExternals, InsertPt);
15574 if ((isBoolLogicOp(RdxRootInst) ||
15575 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
15577 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
15580 if (OptReusedScalars && !SameScaleFactor) {
15582 emitReusedOps(VectorizedRoot, Builder,
V.getRootNodeScalars(),
15583 SameValuesCounter, TrackedToOrig);
15586 Value *ReducedSubTree =
15587 emitReduction(VectorizedRoot, Builder, ReduxWidth,
TTI);
15588 if (ReducedSubTree->
getType() != VL.front()->getType()) {
15590 ReducedSubTree, VL.front()->getType(),
any_of(VL, [&](
Value *R) {
15592 R, cast<Instruction>(ReductionOps.front().front())
15594 ->getDataLayout());
15602 if (OptReusedScalars && SameScaleFactor)
15603 ReducedSubTree = emitScaleForReusedOps(
15604 ReducedSubTree, Builder, SameValuesCounter.
front().second);
15606 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
15608 for (
Value *RdxVal : VL) {
15609 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
15610 if (IsSupportedHorRdxIdentityOp) {
15611 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
15614 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
15615 if (!
V.isVectorized(RdxVal))
15616 RequiredExtract.
insert(RdxVal);
15621 AnyVectorized =
true;
15623 if (OptReusedScalars && !AnyVectorized) {
15624 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
15625 Value *RedVal = emitScaleForReusedOps(
P.first, Builder,
P.second);
15626 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
15627 Value *OrigV = TrackedToOrig.find(
P.first)->second;
15628 VectorizedVals.try_emplace(OrigV,
P.second);
15633 if (VectorizedTree) {
15654 if (!AnyBoolLogicOp)
15656 if (isBoolLogicOp(RedOp1) &&
15657 ((!InitStep &&
LHS == VectorizedTree) ||
15660 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
15661 getRdxOperand(RedOp2, 0) ==
RHS ||
15666 if (
LHS != VectorizedTree)
15677 unsigned Sz = InstVals.
size();
15680 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
15683 Value *RdxVal1 = InstVals[
I].second;
15684 Value *StableRdxVal1 = RdxVal1;
15685 auto It1 = TrackedVals.find(RdxVal1);
15686 if (It1 != TrackedVals.end())
15687 StableRdxVal1 = It1->second;
15688 Value *RdxVal2 = InstVals[
I + 1].second;
15689 Value *StableRdxVal2 = RdxVal2;
15690 auto It2 = TrackedVals.find(RdxVal2);
15691 if (It2 != TrackedVals.end())
15692 StableRdxVal2 = It2->second;
15696 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
15698 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
15699 StableRdxVal2,
"op.rdx", ReductionOps);
15700 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
15703 ExtraReds[Sz / 2] = InstVals.
back();
15707 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
15711 for (
Value *RdxVal : Candidates) {
15712 if (!Visited.
insert(RdxVal).second)
15714 unsigned NumOps = VectorizedVals.lookup(RdxVal);
15721 for (
auto &Pair : ExternallyUsedValues) {
15723 for (
auto *
I : Pair.second)
15727 bool InitStep =
true;
15728 while (ExtraReductions.
size() > 1) {
15729 VectorizedTree = ExtraReductions.
front().second;
15731 FinalGen(ExtraReductions, InitStep);
15732 ExtraReductions.
swap(NewReds);
15735 VectorizedTree = ExtraReductions.
front().second;
15737 ReductionRoot->replaceAllUsesWith(VectorizedTree);
15746 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
15753 for (
auto *U :
Ignore->users()) {
15755 "All users must be either in the reduction ops list.");
15758 if (!
Ignore->use_empty()) {
15760 Ignore->replaceAllUsesWith(Undef);
15762 V.eraseInstruction(cast<Instruction>(
Ignore));
15765 }
else if (!CheckForReusedReductionOps) {
15766 for (ReductionOpsType &RdxOps : ReductionOps)
15767 for (
Value *RdxOp : RdxOps)
15768 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
15770 return VectorizedTree;
15777 bool IsCmpSelMinMax,
unsigned ReduxWidth,
15780 Type *ScalarTy = ReducedVals.
front()->getType();
15789 int Cnt = ReducedVals.
size();
15790 for (
Value *RdxVal : ReducedVals) {
15795 Cost += GenCostFn();
15800 auto *RdxOp = cast<Instruction>(U);
15801 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
15809 Cost += ScalarCost;
15811 Cost += GenCostFn();
15816 case RecurKind::Add:
15817 case RecurKind::Mul:
15818 case RecurKind::Or:
15819 case RecurKind::And:
15820 case RecurKind::Xor:
15821 case RecurKind::FAdd:
15822 case RecurKind::FMul: {
15827 ScalarCost = EvaluateScalarCost([&]() {
15832 case RecurKind::FMax:
15833 case RecurKind::FMin:
15834 case RecurKind::FMaximum:
15835 case RecurKind::FMinimum:
15836 case RecurKind::SMax:
15837 case RecurKind::SMin:
15838 case RecurKind::UMax:
15839 case RecurKind::UMin: {
15843 ScalarCost = EvaluateScalarCost([&]() {
15853 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
15855 <<
" (It is a splitting reduction)\n");
15856 return VectorCost - ScalarCost;
15862 assert(VectorizedValue &&
"Need to have a vectorized tree node");
15864 "We only handle power-of-two reductions for now");
15865 assert(RdxKind != RecurKind::FMulAdd &&
15866 "A call to the llvm.fmuladd intrinsic is not handled yet");
15868 ++NumVectorInstructions;
15875 assert(IsSupportedHorRdxIdentityOp &&
15876 "The optimization of matched scalar identity horizontal reductions "
15877 "must be supported.");
15879 case RecurKind::Add: {
15881 Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
15883 << VectorizedValue <<
". (HorRdx)\n");
15884 return Builder.
CreateMul(VectorizedValue, Scale);
15886 case RecurKind::Xor: {
15888 LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
15889 <<
". (HorRdx)\n");
15892 return VectorizedValue;
15894 case RecurKind::FAdd: {
15896 Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
15898 << VectorizedValue <<
". (HorRdx)\n");
15899 return Builder.
CreateFMul(VectorizedValue, Scale);
15901 case RecurKind::And:
15902 case RecurKind::Or:
15903 case RecurKind::SMax:
15904 case RecurKind::SMin:
15905 case RecurKind::UMax:
15906 case RecurKind::UMin:
15907 case RecurKind::FMax:
15908 case RecurKind::FMin:
15909 case RecurKind::FMaximum:
15910 case RecurKind::FMinimum:
15912 return VectorizedValue;
15913 case RecurKind::Mul:
15914 case RecurKind::FMul:
15915 case RecurKind::FMulAdd:
15916 case RecurKind::IAnyOf:
15917 case RecurKind::FAnyOf:
15918 case RecurKind::None:
15930 assert(IsSupportedHorRdxIdentityOp &&
15931 "The optimization of matched scalar identity horizontal reductions "
15932 "must be supported.");
15933 auto *VTy = cast<FixedVectorType>(VectorizedValue->
getType());
15934 if (VTy->getElementType() != VL.
front()->getType()) {
15940 R, cast<Instruction>(ReductionOps.front().front())
15942 ->getDataLayout());
15947 case RecurKind::Add: {
15950 for (
Value *V : VL) {
15951 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
15952 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
false));
15956 << VectorizedValue <<
". (HorRdx)\n");
15957 return Builder.
CreateMul(VectorizedValue, Scale);
15959 case RecurKind::And:
15960 case RecurKind::Or:
15963 <<
". (HorRdx)\n");
15964 return VectorizedValue;
15965 case RecurKind::SMax:
15966 case RecurKind::SMin:
15967 case RecurKind::UMax:
15968 case RecurKind::UMin:
15969 case RecurKind::FMax:
15970 case RecurKind::FMin:
15971 case RecurKind::FMaximum:
15972 case RecurKind::FMinimum:
15975 <<
". (HorRdx)\n");
15976 return VectorizedValue;
15977 case RecurKind::Xor: {
15983 cast<FixedVectorType>(VectorizedValue->
getType())->getNumElements(),
15985 std::iota(
Mask.begin(),
Mask.end(), 0);
15986 bool NeedShuffle =
false;
15987 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
15989 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
15990 if (Cnt % 2 == 0) {
15992 NeedShuffle =
true;
15998 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
16002 ConstantVector::getNullValue(VectorizedValue->
getType()),
Mask);
16003 return VectorizedValue;
16005 case RecurKind::FAdd: {
16008 for (
Value *V : VL) {
16009 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
16010 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
16013 return Builder.
CreateFMul(VectorizedValue, Scale);
16015 case RecurKind::Mul:
16016 case RecurKind::FMul:
16017 case RecurKind::FMulAdd:
16018 case RecurKind::IAnyOf:
16019 case RecurKind::FAnyOf:
16020 case RecurKind::None:
16029 if (
auto *IE = dyn_cast<InsertElementInst>(InsertInst))
16030 return cast<FixedVectorType>(IE->getType())->getNumElements();
16032 unsigned AggregateSize = 1;
16033 auto *
IV = cast<InsertValueInst>(InsertInst);
16034 Type *CurrentType =
IV->getType();
16036 if (
auto *ST = dyn_cast<StructType>(CurrentType)) {
16037 for (
auto *Elt : ST->elements())
16038 if (Elt != ST->getElementType(0))
16039 return std::nullopt;
16040 AggregateSize *= ST->getNumElements();
16041 CurrentType = ST->getElementType(0);
16042 }
else if (
auto *AT = dyn_cast<ArrayType>(CurrentType)) {
16043 AggregateSize *= AT->getNumElements();
16044 CurrentType = AT->getElementType();
16045 }
else if (
auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
16046 AggregateSize *= VT->getNumElements();
16047 return AggregateSize;
16049 return AggregateSize;
16051 return std::nullopt;
16060 unsigned OperandOffset) {
16063 std::optional<unsigned> OperandIndex =
16067 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
16069 BuildVectorOpds, InsertElts, *OperandIndex);
16072 BuildVectorOpds[*OperandIndex] = InsertedOperand;
16073 InsertElts[*OperandIndex] = LastInsertInst;
16075 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->
getOperand(0));
16076 }
while (LastInsertInst !=
nullptr &&
16077 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
16100 assert((isa<InsertElementInst>(LastInsertInst) ||
16101 isa<InsertValueInst>(LastInsertInst)) &&
16102 "Expected insertelement or insertvalue instruction!");
16105 "Expected empty result vectors!");
16108 if (!AggregateSize)
16110 BuildVectorOpds.
resize(*AggregateSize);
16111 InsertElts.
resize(*AggregateSize);
16116 if (BuildVectorOpds.
size() >= 2)
16134 auto DominatedReduxValue = [&](
Value *R) {
16135 return isa<Instruction>(R) &&
16136 DT->
dominates(
P->getParent(), cast<Instruction>(R)->getParent());
16142 if (
P->getIncomingBlock(0) == ParentBB) {
16143 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
16144 }
else if (
P->getIncomingBlock(1) == ParentBB) {
16145 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
16148 if (Rdx && DominatedReduxValue(Rdx))
16161 if (
P->getIncomingBlock(0) == BBLatch) {
16162 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
16163 }
else if (
P->getIncomingBlock(1) == BBLatch) {
16164 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
16167 if (Rdx && DominatedReduxValue(Rdx))
16201 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
16202 isa<IntrinsicInst>(Root)) &&
16203 "Expected binop, select, or intrinsic for reduction matching");
16205 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
16207 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
16209 return dyn_cast<Instruction>(
RHS);
16211 return dyn_cast<Instruction>(
LHS);
16218 Value *Op0 =
nullptr;
16219 Value *Op1 =
nullptr;
16222 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
16228 Value *B0 =
nullptr, *B1 =
nullptr;
16233bool SLPVectorizerPass::vectorizeHorReduction(
16238 bool TryOperandsAsNewSeeds =
P && isa<BinaryOperator>(Root);
16240 if (Root->
getParent() != BB || isa<PHINode>(Root))
16244 auto SelectRoot = [&]() {
16263 std::queue<std::pair<Instruction *, unsigned>>
Stack;
16264 Stack.emplace(SelectRoot(), 0);
16268 if (
R.isAnalyzedReductionRoot(Inst))
16273 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *
DL, *TLI))
16275 return HorRdx.tryToReduce(R,
TTI, *TLI);
16277 auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
16278 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
16285 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
16290 while (!
Stack.empty()) {
16293 std::tie(Inst, Level) =
Stack.front();
16298 if (
R.isDeleted(Inst))
16300 if (
Value *VectorizedV = TryToReduce(Inst)) {
16302 if (
auto *
I = dyn_cast<Instruction>(VectorizedV)) {
16304 Stack.emplace(
I, Level);
16309 if (!TryAppendToPostponedInsts(Inst)) {
16320 if (VisitedInstrs.
insert(
Op).second)
16321 if (
auto *
I = dyn_cast<Instruction>(
Op))
16324 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
I) &&
16325 !
R.isDeleted(
I) &&
I->getParent() == BB)
16326 Stack.emplace(
I, Level);
16335 bool Res = vectorizeHorReduction(
P, Root, BB, R,
TTI, PostponedInsts);
16336 Res |= tryToVectorize(PostponedInsts, R);
16343 for (
Value *V : Insts)
16344 if (
auto *Inst = dyn_cast<Instruction>(V); Inst && !
R.isDeleted(Inst))
16345 Res |= tryToVectorize(Inst, R);
16349bool SLPVectorizerPass::vectorizeInsertValueInst(
InsertValueInst *IVI,
16351 if (!
R.canMapToVector(IVI->
getType()))
16359 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IVI <<
"\n");
16361 return tryToVectorizeList(BuildVectorOpds, R);
16372 [](
Value *V) {
return isa<ExtractElementInst, UndefValue>(V); }) &&
16376 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IEI <<
"\n");
16377 return tryToVectorizeList(BuildVectorInsts, R);
16380template <
typename T>
16385 bool MaxVFOnly,
BoUpSLP &R) {
16386 bool Changed =
false;
16395 auto *SameTypeIt = IncIt;
16396 while (SameTypeIt !=
E && AreCompatible(*SameTypeIt, *IncIt))
16400 unsigned NumElts = (SameTypeIt - IncIt);
16401 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize starting at nodes ("
16402 << NumElts <<
")\n");
16413 TryToVectorizeHelper(
ArrayRef(IncIt, NumElts), MaxVFOnly)) {
16419 auto GetMinNumElements = [&R](
Value *V) {
16420 unsigned EltSize = R.getVectorElementSize(V);
16421 return std::max(2U, R.getMaxVecRegSize() / EltSize);
16423 if (NumElts < GetMinNumElements(*IncIt) &&
16424 (Candidates.
empty() ||
16425 Candidates.
front()->getType() == (*IncIt)->getType())) {
16426 Candidates.
append(IncIt, std::next(IncIt, NumElts));
16430 if (Candidates.
size() > 1 &&
16431 (SameTypeIt ==
E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
16432 if (TryToVectorizeHelper(Candidates,
false)) {
16435 }
else if (MaxVFOnly) {
16437 for (
auto *It = Candidates.
begin(), *
End = Candidates.
end();
16439 auto *SameTypeIt = It;
16440 while (SameTypeIt !=
End && AreCompatible(*SameTypeIt, *It))
16442 unsigned NumElts = (SameTypeIt - It);
16443 if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(It, NumElts),
16449 Candidates.
clear();
16453 IncIt = SameTypeIt;
16465template <
bool IsCompatibility>
16470 "Expected valid element types only.");
16472 return IsCompatibility;
16473 auto *CI1 = cast<CmpInst>(V);
16474 auto *CI2 = cast<CmpInst>(V2);
16475 if (CI1->getOperand(0)->getType()->getTypeID() <
16477 return !IsCompatibility;
16478 if (CI1->getOperand(0)->getType()->getTypeID() >
16487 if (BasePred1 < BasePred2)
16488 return !IsCompatibility;
16489 if (BasePred1 > BasePred2)
16492 bool CI1Preds = Pred1 == BasePred1;
16493 bool CI2Preds = Pred2 == BasePred1;
16494 for (
int I = 0,
E = CI1->getNumOperands();
I <
E; ++
I) {
16495 auto *Op1 = CI1->getOperand(CI1Preds ?
I :
E -
I - 1);
16500 return !IsCompatibility;
16503 if (
auto *I1 = dyn_cast<Instruction>(Op1))
16504 if (
auto *I2 = dyn_cast<Instruction>(Op2)) {
16505 if (IsCompatibility) {
16506 if (I1->getParent() != I2->getParent())
16513 return NodeI2 !=
nullptr;
16516 assert((NodeI1 == NodeI2) ==
16518 "Different nodes should have different DFS numbers");
16519 if (NodeI1 != NodeI2)
16523 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
16525 if (IsCompatibility)
16527 if (I1->getOpcode() != I2->getOpcode())
16528 return I1->getOpcode() < I2->getOpcode();
16531 return IsCompatibility;
16534template <
typename ItT>
16537 bool Changed =
false;
16540 if (
R.isDeleted(
I))
16543 if (
auto *RootOp = dyn_cast<Instruction>(
Op))
16544 Changed |= vectorizeRootInstruction(
nullptr, RootOp, BB, R,
TTI);
16548 if (
R.isDeleted(
I))
16550 Changed |= tryToVectorize(
I, R);
16557 return compareCmp<false>(V, V2, *TLI, *DT);
16560 auto AreCompatibleCompares = [&](
Value *V1,
Value *
V2) {
16563 return compareCmp<true>(V1, V2, *TLI, *DT);
16570 if (Vals.
size() <= 1)
16572 Changed |= tryToVectorizeSequence<Value>(
16573 Vals, CompareSorter, AreCompatibleCompares,
16576 bool ArePossiblyReducedInOtherBlock =
any_of(Candidates, [](
Value *V) {
16578 auto *Select = dyn_cast<SelectInst>(U);
16580 Select->getParent() != cast<Instruction>(V)->getParent();
16583 if (ArePossiblyReducedInOtherBlock)
16585 return tryToVectorizeList(Candidates, R, MaxVFOnly);
16591bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
16595 return isa<InsertElementInst, InsertValueInst>(
I);
16597 "This function only accepts Insert instructions");
16598 bool OpsChanged =
false;
16601 for (
auto *
I :
reverse(Instructions)) {
16602 if (
R.isDeleted(
I))
16604 OpsChanged |= vectorizeHorReduction(
nullptr,
I, BB, R,
TTI, PostponedInsts);
16607 for (
auto *
I :
reverse(Instructions)) {
16608 if (
R.isDeleted(
I) || isa<CmpInst>(
I))
16610 if (
auto *LastInsertValue = dyn_cast<InsertValueInst>(
I)) {
16611 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
16612 }
else if (
auto *LastInsertElem = dyn_cast<InsertElementInst>(
I)) {
16613 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
16617 OpsChanged |= tryToVectorize(PostponedInsts, R);
16624 bool Changed =
false;
16631 auto PHICompare = [
this, &PHIToOpcodes](
Value *V1,
Value *
V2) {
16634 "Expected vectorizable types only.");
16643 if (Opcodes1.
size() < Opcodes2.
size())
16645 if (Opcodes1.
size() > Opcodes2.
size())
16647 for (
int I = 0,
E = Opcodes1.
size();
I <
E; ++
I) {
16650 auto *
I1 = dyn_cast<Instruction>(Opcodes1[
I]);
16651 auto *I2 = dyn_cast<Instruction>(Opcodes2[
I]);
16656 return NodeI2 !=
nullptr;
16659 assert((NodeI1 == NodeI2) ==
16661 "Different nodes should have different DFS numbers");
16662 if (NodeI1 != NodeI2)
16665 if (S.getOpcode() && !S.isAltShuffle())
16667 return I1->getOpcode() < I2->getOpcode();
16676 bool C1 = isa<Constant>(Opcodes1[
I]) && !isa<UndefValue>(Opcodes1[
I]);
16677 bool C2 = isa<Constant>(Opcodes2[
I]) && !isa<UndefValue>(Opcodes2[
I]);
16685 bool U1 = isa<UndefValue>(Opcodes1[
I]);
16686 bool U2 = isa<UndefValue>(Opcodes2[
I]);
16690 auto ValID1 = Opcodes1[
I]->getValueID();
16691 auto ValID2 = Opcodes2[
I]->getValueID();
16692 if (ValID1 == ValID2)
16694 if (ValID1 < ValID2)
16696 if (ValID1 > ValID2)
16705 assert(U1 && U2 &&
"The only thing left should be undef & undef.");
16710 auto AreCompatiblePHIs = [&PHIToOpcodes,
this](
Value *V1,
Value *
V2) {
16713 if (V1->getType() !=
V2->getType())
16717 if (Opcodes1.
size() != Opcodes2.
size())
16719 for (
int I = 0,
E = Opcodes1.
size();
I <
E; ++
I) {
16721 if (isa<UndefValue>(Opcodes1[
I]) || isa<UndefValue>(Opcodes2[
I]))
16723 if (
auto *I1 = dyn_cast<Instruction>(Opcodes1[
I]))
16724 if (
auto *I2 = dyn_cast<Instruction>(Opcodes2[
I])) {
16725 if (
I1->getParent() != I2->getParent())
16732 if (isa<Constant>(Opcodes1[
I]) && isa<Constant>(Opcodes2[
I]))
16734 if (Opcodes1[
I]->getValueID() != Opcodes2[
I]->getValueID())
16740 bool HaveVectorizedPhiNodes =
false;
16751 if (!VisitedInstrs.
count(
P) && !
R.isDeleted(
P) &&
16764 if (!Opcodes.
empty())
16768 while (!Nodes.empty()) {
16769 auto *
PHI = cast<PHINode>(Nodes.pop_back_val());
16772 for (
Value *V :
PHI->incoming_values()) {
16773 if (
auto *PHI1 = dyn_cast<PHINode>((V))) {
16774 Nodes.push_back(PHI1);
16782 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
16783 Incoming, PHICompare, AreCompatiblePHIs,
16785 return tryToVectorizeList(Candidates, R, MaxVFOnly);
16788 Changed |= HaveVectorizedPhiNodes;
16790 }
while (HaveVectorizedPhiNodes);
16792 VisitedInstrs.
clear();
16794 InstSetVector PostProcessInserts;
16798 auto VectorizeInsertsAndCmps = [&](
bool VectorizeCmps) {
16799 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
16800 if (VectorizeCmps) {
16801 Changed |= vectorizeCmpInsts(
reverse(PostProcessCmps), BB, R);
16802 PostProcessCmps.
clear();
16804 PostProcessInserts.clear();
16809 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
16810 return PostProcessCmps.
contains(Cmp);
16811 return isa<InsertElementInst, InsertValueInst>(
I) &&
16812 PostProcessInserts.contains(
I);
16818 return I->use_empty() &&
16819 (
I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(
I));
16824 if (isa<ScalableVectorType>(It->getType()))
16828 if (
R.isDeleted(&*It))
16831 if (!VisitedInstrs.
insert(&*It).second) {
16832 if (HasNoUsers(&*It) &&
16833 VectorizeInsertsAndCmps(It->isTerminator())) {
16843 if (isa<DbgInfoIntrinsic>(It))
16847 if (
PHINode *
P = dyn_cast<PHINode>(It)) {
16849 if (
P->getNumIncomingValues() == 2) {
16852 if (Root && vectorizeRootInstruction(
P, Root, BB, R,
TTI)) {
16861 for (
unsigned I = 0,
E =
P->getNumIncomingValues();
I !=
E;
I++) {
16866 if (BB ==
P->getIncomingBlock(
I) ||
16867 !DT->isReachableFromEntry(
P->getIncomingBlock(
I)))
16872 if (
auto *PI = dyn_cast<Instruction>(
P->getIncomingValue(
I));
16873 PI && !IsInPostProcessInstrs(PI))
16874 Changed |= vectorizeRootInstruction(
nullptr, PI,
16875 P->getIncomingBlock(
I), R,
TTI);
16880 if (HasNoUsers(&*It)) {
16881 bool OpsChanged =
false;
16882 auto *
SI = dyn_cast<StoreInst>(It);
16892 TryToVectorizeRoot |= (
I == Stores.
end() ||
I->second.size() == 1) &&
16893 SI->getValueOperand()->hasOneUse();
16895 if (TryToVectorizeRoot) {
16896 for (
auto *V : It->operand_values()) {
16899 if (
auto *VI = dyn_cast<Instruction>(V);
16900 VI && !IsInPostProcessInstrs(VI))
16902 OpsChanged |= vectorizeRootInstruction(
nullptr, VI, BB, R,
TTI);
16909 VectorizeInsertsAndCmps(It->isTerminator());
16920 if (isa<InsertElementInst, InsertValueInst>(It))
16921 PostProcessInserts.insert(&*It);
16922 else if (isa<CmpInst>(It))
16923 PostProcessCmps.
insert(cast<CmpInst>(&*It));
16930 auto Changed =
false;
16931 for (
auto &Entry : GEPs) {
16934 if (Entry.second.size() < 2)
16937 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length "
16938 << Entry.second.size() <<
".\n");
16945 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
16946 unsigned EltSize =
R.getVectorElementSize(*Entry.second[0]->idx_begin());
16947 if (MaxVecRegSize < EltSize)
16950 unsigned MaxElts = MaxVecRegSize / EltSize;
16951 for (
unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
16952 auto Len = std::min<unsigned>(BE - BI, MaxElts);
16965 Candidates.remove_if([&R](
Value *
I) {
16966 return R.isDeleted(cast<Instruction>(
I)) ||
16967 isa<Constant>(cast<GetElementPtrInst>(
I)->idx_begin()->
get());
16975 for (
int I = 0,
E = GEPList.size();
I <
E && Candidates.
size() > 1; ++
I) {
16976 auto *GEPI = GEPList[
I];
16977 if (!Candidates.count(GEPI))
16979 auto *SCEVI = SE->
getSCEV(GEPList[
I]);
16980 for (
int J =
I + 1; J <
E && Candidates.
size() > 1; ++J) {
16981 auto *GEPJ = GEPList[J];
16982 auto *SCEVJ = SE->
getSCEV(GEPList[J]);
16983 if (isa<SCEVConstant>(SE->
getMinusSCEV(SCEVI, SCEVJ))) {
16984 Candidates.remove(GEPI);
16985 Candidates.remove(GEPJ);
16986 }
else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
16987 Candidates.remove(GEPJ);
16994 if (Candidates.
size() < 2)
17001 auto BundleIndex = 0
u;
17002 for (
auto *V : Candidates) {
17003 auto *
GEP = cast<GetElementPtrInst>(V);
17004 auto *GEPIdx =
GEP->idx_begin()->get();
17005 assert(
GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
17006 Bundle[BundleIndex++] = GEPIdx;
17018 Changed |= tryToVectorizeList(Bundle, R);
17024bool SLPVectorizerPass::vectorizeStoreChains(
BoUpSLP &R) {
17025 bool Changed =
false;
17030 if (
V->getValueOperand()->getType()->getTypeID() <
17031 V2->getValueOperand()->getType()->getTypeID())
17033 if (
V->getValueOperand()->getType()->getTypeID() >
17034 V2->getValueOperand()->getType()->getTypeID())
17036 if (
V->getPointerOperandType()->getTypeID() <
17037 V2->getPointerOperandType()->getTypeID())
17039 if (
V->getPointerOperandType()->getTypeID() >
17040 V2->getPointerOperandType()->getTypeID())
17043 if (isa<UndefValue>(
V->getValueOperand()) ||
17044 isa<UndefValue>(
V2->getValueOperand()))
17046 if (
auto *I1 = dyn_cast<Instruction>(
V->getValueOperand()))
17047 if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
17049 DT->getNode(
I1->getParent());
17051 DT->getNode(I2->getParent());
17052 assert(NodeI1 &&
"Should only process reachable instructions");
17053 assert(NodeI2 &&
"Should only process reachable instructions");
17054 assert((NodeI1 == NodeI2) ==
17056 "Different nodes should have different DFS numbers");
17057 if (NodeI1 != NodeI2)
17062 return I1->getOpcode() < I2->getOpcode();
17064 if (isa<Constant>(
V->getValueOperand()) &&
17065 isa<Constant>(
V2->getValueOperand()))
17067 return V->getValueOperand()->getValueID() <
17068 V2->getValueOperand()->getValueID();
17080 isa<UndefValue>(
V2->getValueOperand()))
17083 if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
17084 if (
I1->getParent() != I2->getParent())
17087 return S.getOpcode() > 0;
17090 isa<Constant>(
V2->getValueOperand()))
17093 V2->getValueOperand()->getValueID();
17097 for (
auto &Pair : Stores) {
17098 if (Pair.second.size() < 2)
17102 << Pair.second.size() <<
".\n");
17111 Pair.second.rend());
17112 Changed |= tryToVectorizeSequence<StoreInst>(
17113 ReversedStores, StoreSorter, AreCompatibleStores,
17115 return vectorizeStores(Candidates, R);
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better detection of the vectorizable values sequences.
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope exit.
This file defines generic set operations that may be used on sets of different types, e.g. intersection.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
static bool classof(const Value *V)
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
const BasicBlock * getParent() const
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isLoadCombineCandidate() const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail)
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
testing::Matcher< const detail::ErrorHolder & > Failed()
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
LLVM_READONLY APFloat maximum(const APFloat &A, const APFloat &B)
Implements IEEE 754-2019 maximum semantics.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2019 maximumNumber semantics.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2019 minimumNumber semantics.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_READONLY APFloat minimum(const APFloat &A, const APFloat &B)
Implements IEEE 754-2019 minimum semantics.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const