#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif

using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int> MaxVectorRegSizeOption(
    "slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
/// Predicate for the element types that the SLP vectorizer supports.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value/instruction \p V.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

/// Returns the number of elements of the given type \p Ty, not greater than
/// \p Sz, which forms a type that splits into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = Sz / NumParts;
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
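// Worked example (illustrative numbers): for Sz = 7 scalars that the target
// splits so that the whole-register VF comes out as 4, the floor'ed count is
// (7 / 4) * 4 = 4, i.e. only one whole register worth of scalars is treated
// as forming full vectors; the remaining 3 scalars would be padding.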
/// Expands a scalar shuffle mask into a mask that shufflevector can use
/// directly when each "scalar" is itself a vector (REVEC mode).
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}

/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of the values in a group are shufflevector.
/// 2. The mask of all shufflevectors is isExtractSubvectorMask.
/// 3. The masks of all shufflevectors together use all of the elements of the
///    source.
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}

/// \returns a shufflevector mask which is used to vectorize shufflevectors.
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
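// Illustrative IR for the grouping rules above (example, not from the
// source): four shufflevectors that each extract a 2-element subvector of
// the same 8-element source form one group, since their extract indices
// together cover every element of %src:
//   %s0 = shufflevector <8 x i32> %src, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %s1 = shufflevector <8 x i32> %src, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %s2 = shufflevector <8 x i32> %src, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
//   %s3 = shufflevector <8 x i32> %src, <8 x i32> poison, <2 x i32> <i32 6, i32 7>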
/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type
/// or extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// \returns the number of elements in the part \p Part of a vector of size
/// \p Size split into parts of \p PartNumElems elements each.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  BasicBlock *BB = cast<Instruction>(*It)->getParent();
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
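// Example of the Sub special case above: sub is not commutative in general,
// but when every use is an equality compare against zero the operand order
// is irrelevant:
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0   ; %a - %b == 0  <=>  %b - %a == 0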
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns the inserting or extracting index of InsertElement,
/// ExtractElement or InsertValue instruction, using \p Offset as the base
/// offset. \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
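// Worked example for the aggregate case: an insertvalue into
// {[2 x i32], [2 x i32]} at indices {1, 0} linearizes row-major:
// Index = ((0 * 2 + 1) * 2 + 0) = 2.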
/// Prepares a use mask of size \p VF from the given shuffle \p Mask.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
/// Checks if the vector of instructions can be represented as a shuffle of a
/// fixed number of source vectors, and computes the corresponding \p Mask.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector
    // operands in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never filled, we consider it a single-source shuffle.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
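// Informal classification examples: extracts of lanes {0,1,2,3} from a
// single source give SK_PermuteSingleSrc with an identity-like mask; if two
// sources are mixed but every element stays in its own lane
// (Mask[I] % Size == I), the result is SK_Select; any lane-crossing access
// turns the whole bundle into a one- or two-source permute.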
/// \returns the extracted index of Extract{Value,Element} instruction \p E.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main/alternate instruction. MainOp is also VL0.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  /// Checks if the current state is valid, i.e. has a non-null MainOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};
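// Minimal usage sketch: for a bundle {add, add, sub, add}, getSameOpcode
// (defined below) produces a valid state with getOpcode() == Add and
// getAltOpcode() == Sub, so isAltShuffle() is true and the two opcodes are
// later blended with a shuffle; a mixed bundle such as {add, load} yields
// InstructionsState::invalid().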
/// Checks if the provided operands of two cmp instructions are compatible,
/// i.e. the compares can be vectorized without extra shuffles.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (getUnderlyingObject(BaseOp0) == getUnderlyingObject(Op0) &&
          getUnderlyingObject(BaseOp1) == getUnderlyingObject(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).valid() ||
         getSameOpcode({BaseOp1, Op1}, TLI).valid();
}

/// \returns true if a compare instruction \p CI has a similar "look" and same
/// predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, i.e. the opcode under which the whole list could be
/// vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Instruction *MainOp = cast<Instruction>(*It);
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;

  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if considering swapped predicates
    // compatible only 2, consider swappable predicates as compatible opcodes,
    // not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  for (Value *V : iterator_range(It, VL.end())) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltOp = I;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = MainOp->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltOp = I;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltOp = I;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  return InstructionsState(MainOp, AltOp);
}
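// Example of the call handling above: two calls to @llvm.fabs.f32 share the
// same intrinsic ID, so they can form a single vectorizable state, while
// calls to two different scalar library functions fall back to the
// VFDatabase mappings comparison and are rejected unless the mappings are
// identical.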
/// Checks if the given scalar, used by \p UserInst, would have to be
/// extracted even though the user itself is vectorized.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI,
                                        const TargetTransformInfo *TTI) {
  if (!UserInst)
    return false;
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
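// Worked example: Mask = {3, 2, 1, 0} combined with SubMask = {1, 3} yields
// NewMask = {Mask[1], Mask[3]} = {2, 0}: SubMask indexes into the existing
// Mask and the composition replaces it.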
/// Order may have elements assigned a special value (the size of the order),
/// which is out of bounds. Such indices correspond to undef values and are
/// remapped here onto the remaining unused indices.
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}
/// \returns a bitset for selecting opcodes: false for Opcode0 and true for
/// Opcode1.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
                                      unsigned Opcode1) {
  Type *ScalarTy = VL[0]->getType();
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  return OpcodeMask;
}
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}

/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
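// Worked example: Indices = {2, 0, 1} gives Mask = {1, 2, 0} (from
// Mask[Indices[I]] = I); reorderScalars with that Mask then moves Prev[I]
// into Scalars[Mask[I]], i.e. it applies the inverse permutation.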
/// Checks if the provided value does not require scheduling: it has no
/// operand dependencies inside its own block.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling: all of its users
/// are phi nodes or instructions from different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified value does not require scheduling.
static bool doesNotNeedToBeScheduled(Value *V) {
  return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
}

/// Checks if the specified array of instructions does not require scheduling.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

/// Returns true if the widened type of \p Ty elements with size \p Sz
/// represents a full vector type, i.e. adding an extra element results in
/// extra parts upon type legalization.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {
  if (Sz <= 1)
    return false;
  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
    return false;
  if (has_single_bit(Sz))
    return true;
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;

public:
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {}

  /// Returns whether the root node has in-tree uses.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Returns the type/is-signed info for the root node in the graph without
  /// casting.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      return std::make_pair(
          IntegerType::get(Root.Scalars.front()->getContext(),
                           It->second.first),
          It->second.second);
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }

  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns its signedness, if so.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }

  /// Returns the reduction type after minbitwidth analysis.
  FixedVectorType *getReductionType() const {
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      return getWidenedType(
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    return getWidenedType(
        IntegerType::get(
            VectorizableTree.front()->Scalars.front()->getContext(),
            ReductionBitWidth),
        VectorizableTree.front()->getVectorFactor());
  }

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }

  /// Check if the order is an identity order (each value stays in place or is
  /// marked with the out-of-bounds sentinel Sz).
  static bool isIdentityOrder(ArrayRef<unsigned> Order) {
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    return all_of(enumerate(Order), [&](const auto &P) {
      return P.value() == P.index() || P.value() == Sz;
    });
  }

  unsigned getMaxVecRegSize() const { return MaxVecRegSize; }

  unsigned getMinVecRegSize() const { return MinVecRegSize; }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  /// Checks if the given array of loads can be represented as vectorized,
  /// scatter, or just a simple gather.
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;

  /// Registers a non-vectorizable sequence of loads.
  template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL);

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec().
  struct EdgeInfo {
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;

    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
  };
  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The scores reported here mirror the constants used by this heuristic in
    // the original source; the exact values are quoted from LLVM.
    static const int ScoreConsecutiveLoads = 4;
    static const int ScoreReversedLoads = 3;
    static const int ScoreMaskedGatherCandidate = 1;
    static const int ScoreSplatLoads = 3;
    static const int ScoreSplat = 1;
    static const int ScoreConsecutiveExtracts = 4;
    static const int ScoreReversedExtracts = 3;
    static const int ScoreConstants = 2;
    static const int ScoreSameOpcode = 2;
    static const int ScoreAltOpcodes = 1;
    static const int ScoreUndef = 1;
    static const int ScoreFail = 0;
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(V1->getType()) ||
          !isValidElementType(V2->getType()))
        return ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                          ElementCount::getFixed(NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return ScoreSplatLoads;
        }
        return ScoreSplat;
      }

      auto CheckSameEntryOrFail = [&]() {
        if (const TreeEntry *TE1 = R.getTreeEntry(V1);
            TE1 && TE1 == R.getTreeEntry(V2))
          return ScoreSplatLoads;
        return ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
            return ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return ScoreMaskedGatherCandidate;
        return (*Dist > 0) ? ScoreConsecutiveLoads : ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return ScoreConstants;

      // Extracts from consecutive indexes of the same vector score better, as
      // the extracts could be vectorized later.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        if (isa<UndefValue>(V2))
          return ScoreConsecutiveExtracts;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2, m_ExtractElt(m_Value(EV2),
                                   m_CombineOr(m_ConstantInt(Ex2Idx),
                                               m_Undef())))) {
          if (!Ex2Idx)
            return ScoreConsecutiveExtracts;
          if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
            return ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            if (std::abs(Dist) == 0)
              return ScoreSplat;
            if (std::abs(Dist) > NumLanes / 2)
              return ScoreSameOpcode;
            return (Dist > 0) ? ScoreConsecutiveExtracts
                              : ScoreReversedExtracts;
          }
          return ScoreAltOpcodes;
        }
        return CheckSameEntryOrFail();
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S &&
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return isa<PoisonValue>(V) ||
                     cast<Instruction>(V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();
            }))
          return S.isAltShuffle() ? ScoreAltOpcodes : ScoreSameOpcode;
      }

      if (I1 && isa<PoisonValue>(V2))
        return ScoreSameOpcode;

      if (isa<UndefValue>(V2))
        return ScoreUndef;

      return CheckSameEntryOrFail();
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the accumulated score.
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If reached MaxLevel, or if V1 and V2 are not instructions, or if they
      // are already profitable as-is, early-return the current score.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, {});
          // Look for the best score.
          if (TmpScore > ScoreFail && TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };
  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// The Alternate Predicated Opcode (APO) flag of this operand.
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering, we are trying to select the operand at lane
    /// that matches best with the operand at the neighboring lane.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors.
    SmallVector<OperandDataVec, 4> OpsVec;
    /// When VL[0] is an IntrinsicInst, ArgSize is CallBase::arg_size.
    /// Otherwise, it is the number of operands.
    unsigned ArgSize = 0;

    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    const Loop *L = nullptr;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swaps the operand at \p OpIdx1 with the operand at \p OpIdx2.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        return 0;
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.try_emplace(OpIdxLnV, Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }

    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about the number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices): they are extracts
      // themselves and already externally used.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different numbers of external uses.
    static const int ScoreScaleFactor = 10;

    /// \returns the look-ahead score, which tells how much the sub-trees
    /// rooted at \p LHS and \p RHS match.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed,
                          const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      int Score = LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr,
                                               /*U2=*/nullptr,
                                               /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to distinguish between different operands with
          // all-vectorized vs. not-all-vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }
    /// Best defined scores per lanes between the passes. Used to choose the
    /// best operand (with the highest score) between the passes.
    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
        BestScoresPerLanes;

    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the operand must be marked as used.
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
    /// Helper for reorder() to select the lane that is used as the base for
    /// reordering: the lane whose operands constrain the reordering the most.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> implements a simple voting algorithm:
      // the first unsigned counts votes, the second is the lane with
      // instructions with same/alternate opcodes and same parent basic block.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Try to be closer to the original results; if 2 lanes have the same
      // cost, use the one with the highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operands ordering.
      unsigned Hash = 0;
    };

    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode).
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. Count how many operands
      // correspond to the 'true' APO and derive the other count by
      // subtraction. Operands with the same instruction opcode and parent are
      // more profitable since we don't need to move them in many cases.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode and
        // the number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      // IntrinsicInst::isCommutative returns true if swapping the first "two"
      // arguments to the intrinsic produces the same result.
      constexpr unsigned IntrinsicNumOperands = 2;
      unsigned NumOperands = VL0->getNumOperands();
      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
                 "Expected instruction or poison value");
          // The LHS operand of both add and sub is never attached to an
          // inverse operation in the linearized form, so its APO is false.
          // The RHS is true only if VL[Lane] is an inverse operation. Since
          // operand reordering is performed on groups of commutative
          // operations or alternating sequences (e.g., +, -), we can safely
          // tell the inverse operations by checking commutativity.
          if (isa<PoisonValue>(VL[Lane])) {
            OpsVec[OpIdx][Lane] = {
                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
                false};
            continue;
          }
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return ArgSize; }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// \returns true if there are enough operands identical to \p Op to fill
    /// the whole vector (it is mixed with constants or loop invariant values).
    /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // Small number of loads - try load matching.
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. Same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is
              // a constant operand in another lane (which can be moved to the
              // given lane). In this case we can represent it as a simple
              // permutation of constant and broadcast.
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Data.V)) ||
                (Lns == 2 &&
                 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
                 isa<Constant>(Data.V)))) ||
              // 3. The operand in the current lane is loop invariant (can be
              // hoisted out) and another operand is also a loop invariant
              // (though not a constant). In this case the whole vector can be
              // hoisted out.
              (IsInvariant && !isa<Constant>(Data.V) &&
               !getSameOpcode({Op, Data.V}, TLI) &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }

    /// Checks if there is at least one compatible operand in lanes other than
    /// \p Lane, compatible with the operand \p Op.
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                return false;
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     (getSameOpcode({Op, OpILn}, TLI) &&
                      allSameBlock({Op, OpILn}));
            }))
          return true;
      }
      return false;
    }
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor((VL0->getParent()))) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL, VL0);
    }

    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    // Performs operand reordering for 2 or more operands.
    // The original operands are in OrigOps[OpIdx][Lane].
    // The reordered operands are returned in Ops[OpIdx][Lane].
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us
      // select the instructions for each lane, so that they match best with
      // the ones we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // However, in order to increase its effectiveness, we start with the
      // lane that has operands that can move the least, as they introduce the
      // most constraints.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        }
      }

      // Check that we don't have the same operands. No need to reorder if
      // operands are just a perfect diamond or shuffled diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        return true;
      };

      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass.
      for (int Pass = 0; Pass != 2; ++Pass) {
        // Skip reordering on perfect/shuffled diamond matches.
        if (SkipReordering())
          break;
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
        clearUsed();
        // We keep the original operand order for the FirstLane, so reorder the
        // rest of the lanes. We are visiting the nodes in a circular fashion,
        // using FirstLane as the center point and increasing the radius
        // distance.
        SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
        for (unsigned I = 0; I < NumOperands; ++I)
          MainAltOps[I].push_back(getData(I, FirstLane).V);

        SmallBitVector UsedLanes(NumLanes);
        UsedLanes.set(FirstLane);
        for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
          // Visit the lane on the right and then the lane on the left.
          for (int Direction : {+1, -1}) {
            int Lane = FirstLane + Direction * Distance;
            if (Lane < 0 || Lane >= (int)NumLanes)
              continue;
            UsedLanes.set(Lane);
            int LastLane = Lane - Direction;
            assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                   "Out of bounds");
            // Look for a good match for each operand.
            for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that matches SortedOps[OpIdx][Lane-1].
              std::optional<unsigned> BestIdx =
                  getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                                 MainAltOps[OpIdx], UsedLanes);
              // Swap the current operand with the one returned by
              // getBestOperand(), if any; otherwise enable the second pass.
              if (BestIdx) {
                swap(OpIdx, *BestIdx, Lane);
              } else {
                StrategyFailed = true;
              }
              // Try to get the alternate opcode and follow it during analysis.
              if (MainAltOps[OpIdx].size() != 2) {
                OperandData &AltOp = getData(OpIdx, Lane);
                InstructionsState OpS =
                    getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
                if (OpS && OpS.isAltShuffle())
                  MainAltOps[OpIdx].push_back(AltOp.V);
              }
            }
          }
        }
        // Skip the second pass if the strategy did not fail.
        if (!StrategyFailed)
          break;
      }
    }
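    // Summary of the strategy above (informal): start from the most
    // constrained lane, walk outward in both directions, and for each lane
    // greedily pick the operand that scores best against the previously
    // reordered lane; if any lane fails to find a match, a second pass
    // re-runs the whole procedure seeded with the main/alternate operand
    // candidates gathered so far.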
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }
#endif
  };
  /// Evaluate each pair in \p Candidates and return the index of the pair
  /// with the highest look-ahead score, i.e. the best chance to form the root
  /// of a profitable tree. Returns std::nullopt if no candidate scored above
  /// \p Limit.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(
          Candidates[I].first, Candidates[I].second, /*U1=*/nullptr,
          /*U2=*/nullptr, /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }

  /// Removes an instruction from its block and eventually deletes it. The
  /// actual deletion is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(I);
  }
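  // Usage sketch for findBestRootPair (hypothetical operands): given
  // candidate root pairs {(%a0, %b0), (%a1, %b1)}, the look-ahead recursion
  // scores each pair's operand subtrees and the index of the highest-scoring
  // pair is returned, or std::nullopt when nothing beats the Limit score.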
  /// Remove instructions from the parent function, clear the operands of \p
  /// DeadVals instructions, and mark trivially dead operands for deletion.
  template <typename T>
  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
    SmallVector<WeakTrackingVH> DeadInsts;
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      SmallVector<const TreeEntry *> Entries;
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      }
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() || all_of(I->uses(),
                                       [&](Use &U) {
                                         return isDeleted(
                                             cast<Instruction>(U.getUser()));
                                       })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(*VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
      SE->forgetValue(VI);
    }
  }
  /// Checks if the instruction was already analyzed as a possible reduction
  /// root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register the given instruction as already analyzed as a possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if the value is marked as a non-schedulable first-in-bundle value.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }

private:
  /// Collects the values that can be demoted to a smaller bitwidth
  /// (minbitwidth analysis).
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
      unsigned &MaxDepthLevel,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only
  /// one user and are reorderable).
  bool canReorderOperands(TreeEntry *UserTE,
                          SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
                          ArrayRef<TreeEntry *> ReorderableGathers,
                          SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Checks if the given \p TE is a gather node with clustered reused scalars
  /// and reorders it per the given \p Mask.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the
  /// graph, if any. If it is not vectorized (gather node), returns nullptr.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
        return true;
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
            TE = E;
            return true;
          }
        }
      }
      return false;
    });
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
      return TE;
    }
    return nullptr;
  }

  /// Const version of the above.
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(
      Instruction *I,
      const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;

  /// \returns the graph entry for the \p Idx operand of the \p E entry.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  /// Gets the root instruction for the given node. If the node is a strided
  /// load/store node with the reverse order, the root instruction is the last
  /// one.
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  /// \returns the cast context for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                     const EdgeInfo &EI, unsigned InterleaveFactor = 0);

  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL
  /// can be vectorized to use the original vector; otherwise sets \p
  /// CurrentOrder to a non-identity permutation that allows reusing extract
  /// instructions.
  bool canReuseExtract(ArrayRef<Value *> VL,
                       SmallVectorImpl<unsigned> &CurrentOrder,
                       bool ResizeAllowed = false) const;

  /// Returns the vectorized operand node that matches the order of the
  /// scalars, if any.
  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
  const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
                                               unsigned NodeIdx) const {
    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
  }

  /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
  /// entry \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  /// Create a new vector from a list of scalar values. Produces a sequence
  /// which exploits values reused across lanes, and arranges the inserts for
  /// ease of later optimization.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  /// Same as above, returning the built vector value.
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
                           bool PostponedPHIs);

  /// Returns the instruction in the bundle which can be used as a base point
  /// for scheduling.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Tries to find extractelement instructions with constant indices from a
  /// fixed vector type and gathers such instructions into a bunch, which is
  /// likely detectable as a shuffle of 1 or 2 input vectors.
  std::optional<TargetTransformInfo::ShuffleKind>
  tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
                                           SmallVectorImpl<int> &Mask) const;

  /// Same as above, but per register part.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                             SmallVectorImpl<int> &Mask,
                             unsigned NumParts) const;

  /// Checks if the gathered \p VL can be represented as a single-register
  /// shuffle of previous tree entries.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
      bool ForOrder);

  /// Checks if the gathered \p VL can be represented as multi-register
  /// shuffle(s) of previous tree entries.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
      unsigned NumParts, bool ForOrder = false);

  /// \returns the cost of gathering (inserting) the values in \p VL into a
  /// vector.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                Type *ScalarTy) const;

  /// Set the Builder insert point to one past the last instruction in the
  /// bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns whether the VectorizableTree is fully vectorizable and will be
  /// beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Run through the list of all gathered loads in the graph and try to find
  /// vector loads/masked gathers instead of regular loads.
  void tryToVectorizeGatheredLoads(
      const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                           SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                           8> &GatheredLoads);

  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over
  /// the users of \p TE and collects the stores.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Iterates through the users of \p TE, looking for scalar stores that can
  /// potentially be vectorized in a future SLP tree.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  /// Tries to reorder the gathering node for better vectorization
  /// opportunities.
  void reorderGatherNode(TreeEntry &TE);
  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching with the ordering of the scalars.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && !UserTreeIndices.empty() &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    /// \returns true if the current entry has the same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \returns the vectorization factor of the entry.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }
    /// State of the combined entry. Used for representing minmax/select-like
    /// nodes.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// Points back to the VectorizableTree that contains this entry.
    VecTreeTy &Container;

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

  public:
    /// Returns the interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets the interleaving factor for the interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// Set this bundle's operands from the computed VLOperands.
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, MainOp, R);
      if (RequireReorder)
        Ops.reorder();
      for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the single operand value at \p OpIdx.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main opcode, the key is \p Op, otherwise it is
    /// MainOp.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      MainOp = S.getMainOp();
      AltOp = S.getAltOp();
    }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    }

    unsigned getAltOpcode() const {
      return AltOp ? AltOp->getOpcode() : 0;
    }

    /// When ReuseShuffleIndices is empty it just returns the position of \p V
    /// within the vector of Scalars. Otherwise, it tries to remap on its
    /// reuse index.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }

    /// Returns true if the current node has a non-power-of-2 number of
    /// scalars.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
      return IsNonPowerOf2;
    }

    /// Returns true if the current node has padding, i.e. a number of scalars
    /// that does not fill whole vector registers.
    bool hasNonWholeRegisterOrNonPowerOf2Vec(
        const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, getValueType(Scalars.front()), Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }
#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };
#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
    return E;
  }

  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original
    // one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    assert((hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()),
                                     VL.size()) ||
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build the final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (!Last->isGather()) {
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        const TreeEntry *TE = getTreeEntry(V);
        assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
               "Scalar already in tree!");
        if (TE) {
          if (TE != Last)
            MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
          continue;
        }
        ScalarToTreeEntry[V] = Last;
      }
      // Update the scheduler bundle to point to this TreeEntry.
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
              isVectorLikeInstWithConstOps(S.getMainOp()) ||
              doesNotNeedToSchedule(VL)) &&
             "Bundle and VL out of sync");
      if (BundleMember) {
        for (Value *V : VL) {
          if (doesNotNeedToBeScheduled(V))
            continue;
          if (!BundleMember)
            continue;
          BundleMember->TE = Last;
          BundleMember = BundleMember->NextInBundle;
        }
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL)
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
    return Last;
  }
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  }

  /// Check that the operand node of an alternate node does not generate a
  /// buildvector sequence. If it does, it is likely not worth building an
  /// alternate shuffle.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
  TreeEntry::EntryState
  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
                               bool IsScatterVectorizeUserTE,
                               OrdersType &CurrentOrder,
                               SmallVectorImpl<Value *> &PointerOps);

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// True if the graph nodes are being transformed (graph transform mode).
  bool IsGraphTransformMode = false;

  /// The index of the first gathered load entry in the VectorizableTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar;
    /// Which user that uses the scalar.
    llvm::User *User;
    /// Which lane does the scalar belong to.
    int Lane;
  };
  using UserList = SmallVector<ExternalUser, 16>;

  /// Checks if two instructions may access the same memory.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
      return It->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache, for both orders of the pair.
    AliasCache.try_emplace(Key, Aliased);
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  DenseMap<AliasCacheKey, bool> AliasCache;

  /// A list of values that need to be extracted out of the tree. This list
  /// holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;
  /// Contains all scheduling relevant data for an instruction. A ScheduleData
  /// either represents a single instruction or a member of an instruction
  /// bundle (= a group of instructions which is combined into a vector
  /// instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Instruction *I) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
      TE = nullptr;
    }

    /// Verify basic self-consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions. This list is derived on demand in
    /// calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on.
    SmallVector<ScheduleData *, 4> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies: the number of users of the instruction
    /// plus the number of dependent memory instructions (if any). Calculated
    /// on demand; InvalidDeps means not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. As soon as this is zero, the
    /// instruction/bundle gets ready for scheduling.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif
  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if
        // ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(), so get its operands through the
        // TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be
          // reordered.
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          // Known exceptions where the operand count differs are extracts and
          // intrinsics whose immediate operands are not added; immediates do
          // not affect scheduler behavior, so this is considered okay.
          auto *In = BundleMember->Inst;
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() == TE->getNumOperands()) &&
              "Missed TreeEntry operands?");
          (void)In; // fake use to avoid unused-variable warning

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }

    /// Verify basic self-consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        SD->verify();
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:    initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle; the returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The ID of the scheduling region. For a new vectorization iteration
    /// this is incremented, which "removes" all ScheduleData from the region.
    /// The initial SchedulingRegionID is greater than the initial
    /// SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);
  /// Helper structure to allow using OrdersType as DenseMap keys.
  struct OrdersTypeDenseMapInfo {
    static OrdersType getEmptyKey() {
      OrdersType V;
      V.push_back(~1U);
      return V;
    }

    static OrdersType getTombstoneKey() {
      OrdersType V;
      V.push_back(~2U);
      return V;
    }

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };

  unsigned MaxVecRegSize; // Set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt.

  /// Final size of the reduced vector, if the current graph represents the
  /// input for a reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains the max-min
  /// pair of the sizes of the nodes bitwidths.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;
  using NodeRef = TreeEntry *;
  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator just turn the TreeEntry iterator into a TreeEntry*
  /// iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};

template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
    return "";
  }
};
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back into the block to erase them
      // from the parent and from memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
4544 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
4545 "Expected non-empty mask.");
4548 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
4550 Reuses[Mask[
I]] = Prev[
I];
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    // Drop the order if every defined element already sits at its own
    // position.
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  // ...
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
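/// Checks if the gathered scalars of the given node can be represented as a
/// shuffle of previously vectorized nodes and/or extractelements and, if so,
/// returns the order that makes that reuse possible.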
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  // ...
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy);
  if (NumParts == 0 || NumParts >= NumScalars ||
      VecTy->getNumElements() % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
                                VecTy->getNumElements() / NumParts))
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
      ExtractShuffles =
          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized node.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      bool SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          (CompareOpcodes && GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
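/// Check if \p Order represents a reverse order of the vectorized scalars,
/// i.e. every defined element satisfies Order[I] == Size - I - 1. For
/// example, <3, 2, 1, 0> is a reverse order for four scalars.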
4799 "Order is empty. Please check it before using isReverseOrder.");
4800 unsigned Sz = Order.
size();
4802 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
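/// Checks if the provided pointers \p PointerOps can be expressed in terms of
/// a single runtime SCEV stride, i.e. Ptr[I] = PtrLowest + I * Stride for
/// some permutation of the pointers; on success the permutation is stored in
/// \p SortedIndices.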
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (with min/max values) and
  // keep the distance between them.
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are unique
  // multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      // ...
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if (((Dist / Size) * Size != Dist) || ((Dist / Size) >= SCEVs.size()))
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  // ...
}
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

// ...
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  if (ShuffleVectorInst::isInsertSubvectorMask(Mask, NumSrcElts, NumSubElts,
                                               Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size())) {
      // ...
    }
  }
// ...
  if (Index % SubVecVF == 0) {
    // ...
  } else {
    // Create a shuffle; insertvector requires that the index is a multiple of
    // the subvector length.
    // ...
    for (unsigned I : seq<unsigned>(SubVecVF))
      Mask[I + Index] = I + VecVF;
    Vec = Generator(Vec, V, Mask);
    // ...
    std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
  }
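/// Checks if the given list of loads \p VL can be vectorized as a plain wide
/// load, a strided load or a masked gather, and returns the corresponding
/// LoadsState. The (possibly sorted) pointer operands are reported through
/// \p PointerOps and the best profitable vector factor, if any, through
/// \p BestVF.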
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  // ...
  // Don't vectorize types that are padded when being allocated as scalars,
  // while being packed together in a vector (such as i1).
  Type *ScalarTy = VL0->getType();
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  const unsigned Sz = VL.size();
  PointerOps.clear();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  // ...
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  // ...
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<unsigned>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    // ...
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
    // Try to generate a strided load node. A strided pattern is considered
    // if any pointer is used outside the graph (the pointers would have to
    // be gathered anyway), the absolute distance exceeds the number of
    // loads, or the loads go in reversed order.
    auto IsAnyPointerUsedOutGraph =
        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                   return !getTreeEntry(U) && !MustGather.contains(U);
                 });
        });
    const unsigned AbsoluteDiff = std::abs(*Diff);
    if (IsPossibleStrided &&
        (IsAnyPointerUsedOutGraph ||
         (/* ... */ AbsoluteDiff > Sz) ||
         *Diff == -(static_cast<int>(Sz) - 1))) {
      int Stride = *Diff / static_cast<int>(Sz - 1);
      if (*Diff == Stride * static_cast<int>(Sz - 1)) {
        // Iterate through all pointers and check if all distances are
        // unique multiples of Stride.
        SmallSet<int, 4> Dists;
        for (Value *Ptr : PointerOps) {
          int Dist = 0;
          if (Ptr == PtrN)
            Dist = *Diff;
          else if (Ptr != Ptr0)
            Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
          // If the strides are not the same or repeated, we can't vectorize.
          if (((Dist / Stride) * Stride) != Dist ||
              !Dists.insert(Dist).second)
            break;
        }
        if (Dists.size() == Sz)
          return LoadsState::StridedVectorize;
      }
    }
  // ...
  // Compare the cost of loads + shuffles rather than strided/masked gather
  // loads. Returns true if the vectorized + shuffles representation is
  // better than just a gather.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    // ...
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of the masked gather GEP. If not a splat, roughly
    // take it as the number of GEPs.
    auto *PtrVecTy = getWidenedType(PointerOps.front()->getType(),
                                    VecTy->getNumElements());
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        /* ... */)
      VectorGEPCost += TTI.getScalarizationOverhead(
          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false,
          CostKind);
    // ...
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   PointerOps.front(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    // ...
    // The list of loads is too short, no need to check the recursive
    // vectorization.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return false;
    // ...
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    // ...
    // Iterate through possible vectorization factors and check if the
    // vectorized + shuffles representation beats the gather.
    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
      // ...
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        // ...
        // Mark masked gather candidates as vectorized.
        DemandedElts.setBits(Cnt, Cnt + VF);
        // ...
      }
      if (!DemandedElts.isZero()) {
        // Some loads will be vectorized as masked gathers - account the
        // extraction cost of their demanded elements.
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            ; // ... add the extraction cost for this element ...
        // ...
      }
      for (auto [I, LS] : enumerate(States)) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        // ...
        auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
            TTI, ArrayRef(PointerOps).slice(I * VF, VF),
            LI0->getPointerOperand(), Instruction::GetElementPtr, CostKind,
            ScalarTy, SubVecTy);
        // ...
        if (static_cast<unsigned>(
                count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
                PointerOps.size() - 1 ||
            /* ... */) {
          // ...
        }
        // Depending on the detected state, cost the slice as a plain load
        // (TTI.getMemoryOpCost with LI0->getAlign() and
        // LI0->getPointerAddressSpace(), CostKind), a strided load
        // (TTI.getStridedMemoryOpCost with LI0->getPointerOperand()) or a
        // masked gather (TTI.getGatherScatterOpCost with
        // LI0->getPointerOperand()).
        // ...
        for (int Idx : seq<int>(0, VL.size()))
          ; // ... build the insert-subvector shuffle mask ...
        // ...
      }
      // ...
      if (MaskedGatherCost >= VecLdCost &&
          /* ... */) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return false;
  };
  // TODO: need to improve analysis of the pointers, if not all of them are
  // GEPs or have > 2 operands, we end up with a gather node, which just
  // increases the cost.
  Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return !GEP || (GEP->getNumOperands() == 2 &&
                        isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // Check if a potential masked gather can be represented as a series of
    // loads + insert subvector patterns.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
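/// Sorts the pointer operands in \p VL by distance from their common base
/// objects, clustering pointers with the same base next to each other;
/// returns true and fills \p SortedIndices if a useful clustering exists.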
5334 "Expected list of pointer operands.");
5344 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5346 SortedIndices.
clear();
5348 auto Key = std::make_pair(BBs[Cnt + 1],
5352 std::optional<int> Diff = getPointersDiff(
5353 ElemTy, std::get<0>(Base.front()), ElemTy,
5359 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5365 if (Bases.
size() > VL.
size() / 2 - 1)
5369 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5376 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5377 Bases.
front().second.size() == VL.
size()))
5382 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5391 FirstPointers.
insert(P1);
5392 SecondPointers.
insert(P2);
5398 "Unable to find matching root.");
5401 for (
auto &
Base : Bases) {
5402 for (
auto &Vec :
Base.second) {
5403 if (Vec.size() > 1) {
5404 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5405 const std::tuple<Value *, int, unsigned> &
Y) {
5406 return std::get<1>(
X) < std::get<1>(
Y);
5408 int InitialOffset = std::get<1>(Vec[0]);
5409 bool AnyConsecutive =
5411 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5415 if (!AnyConsecutive)
5420 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5424 for (
auto &
T : Bases)
5425 for (
const auto &Vec :
T.second)
5426 for (
const auto &
P : Vec)
5430 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
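/// Checks if the two insertelement instructions \p VU and \p V form (part of)
/// the same buildvector sequence, walking the chain of vector operands via
/// \p GetBaseOperand and making sure no insertion index is reused.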
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to
  // find either VU as an operand of IE2 or IE2 as an operand of VU.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
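/// Computes the preferred scalar order for the given tree entry \p TE, if
/// any: from reuse shuffles, extractelement positions, phi users, partially
/// ordered loads or reusable gather sequences. \p TopToBottom selects which
/// reordering phase the data is requested for.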
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check the order of the gathered loads.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
                                                2 * TE.getVectorFactor())) ==
            1)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they
    // are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                 "by BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      // ...
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && IE2) {
        // If the users are the same or different buildvectors, keep the
        // order.
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(),
                                        Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as just a
      // shuffle of a single/two vectors.
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                   /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    //   insertelement poison, v, 0 [+ permute]
    // is cheaper than
    //   insertelement poison, v, n
    // then try to reorder.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      // ...
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        // ...
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          Order[Idx] = 0;
          fixupOrderingIndices(Order);
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
    if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
  }
  return std::nullopt;
}
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
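/// Reorders the given node with reused scalars according to the given
/// \p Mask, updating its reuse-shuffle indices and, for gathers with
/// clustered reuses, rewriting the reuses mask with identity submasks.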
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reused scalars - just reorder.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  // Try to improve gathered nodes with clustered reuses, if possible.
  // ...
  addMask(NewMask, TE.ReuseShuffleIndices);
  // ...
  TE.ReorderIndices.clear();
  // ...
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
5877 "Expected same size of orders");
5878 unsigned Sz = Order.
size();
5880 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5881 if (Order[
Idx] != Sz)
5882 UsedIndices.
set(Order[
Idx]);
5884 if (SecondaryOrder.
empty()) {
5885 for (
unsigned Idx : seq<unsigned>(0, Sz))
5886 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5889 for (
unsigned Idx : seq<unsigned>(0, Sz))
5890 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5891 !UsedIndices.
test(SecondaryOrder[
Idx]))
5892 Order[
Idx] = SecondaryOrder[
Idx];
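/// Reorders the whole graph top-to-bottom: collects, for every vector
/// factor, the preferred orders of the vectorizable nodes (from reuses,
/// alt-shuffles, external store users and gathered loads), votes for the
/// most frequently used order and applies it to all entries of that VF.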
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // Gather nodes which can be vectorized and need to handle their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Phi nodes can have preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g. the addsub instruction in x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
  // Maps a TreeEntry to the reorder indices of its external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern, so take
    // their order into account when looking for the most used order.
    if (TE->isAltShuffle()) {
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target then consider its order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
      // TODO: Check the reverse order too.
    }

    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include orders for nodes used in alt opcode vectorization:
      // better to reorder them during the bottom-to-top stage.
      const TreeEntry *UserTE = TE.get();
      // ...
      if (UserTE->UserTreeIndices.size() != 1)
        return;
      if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
            return EI.UserTE->State == TreeEntry::Vectorize &&
                   EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
          }))
        return;
      UserTE = UserTE->UserTreeIndices.back().UserTE;
      // ...
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: the most used order wins and all
    // nodes of this VF are reordered accordingly.
    // ...
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; just merge the reordering shuffle and
      // the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If vector factor != number of scalars - use natural order.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask of the pointer operands, not the
      // scalars themselves - need to invert the order.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But if a filled identity was found with the
      // same number of uses as the new candidate order, choose the candidate.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        combineOrders(Pair.first, BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Need to reorder the reuses masks of the operands with smaller VF
          // to be able to find the match between the graph nodes and scalar
          // operands of the given node during vectorization/cost estimation.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          if (SLPReVec) {
            // ShuffleVectorInst does not do reorderOperands (it supports only
            // a limited set of patterns), so only reorder if none of the
            // users is a ShuffleVectorInst.
            if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                  return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
                }))
              continue;
            assert(none_of(TE->UserTreeIndices,
                           [&](const EdgeInfo &EI) {
                             return isa<ShuffleVectorInst>(
                                 EI.UserTE->getMainOp());
                           }) &&
                   "Does not know how to reorder.");
          }
          // Update ordering of the operands with the smaller VF than the
          // given one.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
               InsertElementInst>(TE->getMainOp()) ||
           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
        assert(!TE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Build correct orders for extract{element,value}, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements to avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // 1. Propagate order to the graph nodes which use only reordered nodes.
  // I.e. it is not profitable to reorder nodes whose users require different
  // orders.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speed up
      // the search; the graph does not provide this dependency directly.
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase filtered entries, so they are not checked again.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that the operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully
        // non-ordered orders.
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask of the pointer operands, not the
        // scalars themselves - need to invert the order.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() ||
              !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            // ...
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes
          // require reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if the users allow reordering: look up just one level of
          // operands to avoid compile-time increase. Reordering is
          // profitable only if definitely more operands allow it than
          // require the natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer identity order.
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder their scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
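/// Collects all the scalars in the tree that have uses outside of it (or
/// extra arguments in \p ExternallyUsedValues) and records them in
/// ExternalUses, so extractelement instructions can be generated for them
/// later.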
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        // ...
        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will be
          // used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // ...
    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // If this is not a simple store in this function, bail out.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entries already in the tree.
      if (getTreeEntry(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting, we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the
  // sorted StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (e.g., {0,1,2,3}) is represented as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we follow the same
  // convention here.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and
  // push it into `ExternalReorderIndices`.
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector
    // instruction, so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
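/// Entry points for building the vectorizable tree from the given roots,
/// optionally ignoring the users listed in \p UserIgnoreLst (e.g. reduction
/// operations that are handled separately).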
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
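/// Clusters the loads in \p VL by base pointer and known constant distance,
/// appending the clusters to \p GatheredLoads so that they can be
/// re-examined for vectorization later.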
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  // ...
  SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
  SmallVector<SmallDenseMap<int, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        // ...
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          // ...
          std::optional<int> Dist =
              getPointersDiff(/* ... */ Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                /* ... */))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int Offset = 0;
    unsigned Start = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        It = find_if(GatheredLoads,
                     [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                       return PD.front().first->getParent() ==
                                  LI->getParent() &&
                              PD.front().first->getType() == LI->getType();
                     });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
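/// Tries to vectorize the previously gathered (non-consecutive) loads: sorts
/// them by distance, looks for consecutive or interleaved runs, attempts
/// masked gathers for the remainder and finally re-builds the delayed load
/// entries.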
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                         8> &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
               VectorizableTree[Idx]->Scalars.end());

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                       const std::pair<LoadInst *, int> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    SmallVector<Value *> Values(Loads.begin(), Loads.end());
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSet<LoadInst *, 4> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
    SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
    // Build the list of candidate vector factors, from the largest whole
    // register size down.
    for (int NumElts = getFloorFullVectorNumberOfElements(
             *TTI, Loads.front()->getType(), MaxVF);
         NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                          *TTI, Loads.front()->getType(), NumElts - 1)) {
      CandidateVFs.push_back(NumElts);
      // ...
    }
    if (Final && CandidateVFs.empty())
      return Results;
    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
        continue;
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
        ArrayRef<LoadInst *> Slice =
            ArrayRef(Loads).slice(Cnt, std::min<unsigned>(NumElts, E - Cnt));
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
            areKnownNonVectorizableLoads(Slice))
          continue;
        // Check if it is profitable to try vectorizing the gathered loads.
        bool AllowToVectorize = false;
        // Check if it is profitable to vectorize 2-element loads.
        if (NumElts == 2) {
          bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
              Slice.front()->getType(), ElementCount::getFixed(NumElts));
          auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
            for (LoadInst *LI : Slice) {
              // If single use/user - allow to vectorize.
              if (LI->hasOneUse())
                continue;
              // 1. Check if the number of uses equals the number of users.
              // 2. All users are deleted.
              // 3. The load broadcasts are not allowed or the load is not
              //    broadcasted.
              if (static_cast<unsigned int>(std::distance(
                      LI->user_begin(), LI->user_end())) != LI->getNumUses())
                return false;
              if (!IsLegalBroadcastLoad)
                continue;
              // ...
              for (User *U : LI->users()) {
                if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
                  continue;
                if (const TreeEntry *UTE = getTreeEntry(U)) {
                  for (int I : seq<int>(UTE->getNumOperands())) {
                    if (all_of(UTE->getOperand(I),
                               [LI](Value *V) { return V == LI; }))
                      // Found legal broadcast - do not vectorize.
                      return false;
                  }
                }
              }
            }
            return true;
          };
          AllowToVectorize = CheckIfAllowed(Slice);
        } else {
          AllowToVectorize =
              (NumElts >= 3 ||
               any_of(ValueToGatherNodes.at(Slice.front()),
                      [=](const TreeEntry *TE) {
                        return TE->Scalars.size() == 2 &&
                               ((TE->Scalars.front() == Slice.front() &&
                                 TE->Scalars.back() == Slice.back()) ||
                                (TE->Scalars.front() == Slice.back() &&
                                 TE->Scalars.back() == Slice.front()));
                      })) &&
              hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                       Slice.size());
        }
        if (AllowToVectorize) {
          SmallVector<Value *> PointerOps;
          OrdersType CurrentOrder;
          // Try to build vector load.
          ArrayRef<Value *> Values(
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          LoadsState LS = canVectorizeLoads(Values, Slice.front(),
                                            CurrentOrder, PointerOps, &BestVF);
          if (LS != LoadsState::Gather ||
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (LS == LoadsState::ScatterVectorize) {
              if (MaskedGatherVectorized.empty() ||
                  Cnt >= MaskedGatherVectorized.back() + NumElts)
                MaskedGatherVectorized.push_back(Cnt);
              continue;
            }
            if (LS != LoadsState::Gather) {
              Results.emplace_back(Values, LS);
              VectorizedLoads.insert(Slice.begin(), Slice.end());
              // If we vectorized the initial block, no need to try it again.
              if (Cnt == StartIdx)
                StartIdx += NumElts;
            }
            // Check if the whole array was vectorized already - exit.
            if (StartIdx >= Loads.size())
              break;
            // Erase last masked gather candidate, if another candidate within
            // the range is found to be better.
            if (!MaskedGatherVectorized.empty() &&
                Cnt < MaskedGatherVectorized.back() + NumElts)
              MaskedGatherVectorized.pop_back();
            Cnt += NumElts - 1;
            continue;
          }
        }
        if (!AllowToVectorize || BestVF == 0)
          registerNonVectorizableLoads(Slice);
      }
      // Mark masked gather candidates as vectorized, if any.
      for (unsigned Cnt : MaskedGatherVectorized) {
        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        ArrayRef<Value *> Values(
            reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
        Results.emplace_back(Values, LoadsState::ScatterVectorize);
        VectorizedLoads.insert(Slice.begin(), Slice.end());
        // If we vectorized the initial block, no need to try it again.
        if (Cnt == StartIdx)
          StartIdx += NumElts;
      }
    }
    for (LoadInst *LI : Loads) {
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
    }
    return Results;
  };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists :
             GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
            if (getTreeEntry(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<unsigned>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              all_of(Results,
                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
                       return P.second == LoadsState::ScatterVectorize;
                     })) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
              for (Value *L : Slice)
                if (!getTreeEntry(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }

            // Select maximum VF as a maximum of user gathered nodes and
            // distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is the part of the interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (getTreeEntry(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load
              // operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order,
                                      PointerOps) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes -
              // just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           ArrayRef<Value *> VL =
                               VectorizableTree[std::get<0>(P)]->Scalars;
                           ConsecutiveNodesSize += VL.size();
                           unsigned Start = std::distance(Slice.begin(), It);
                           unsigned Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(std::distance(Slice.begin(), It),
                                              VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (getTreeEntry(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry that is
                // not equal to this entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with a smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
              return S + LoadsDists.size();
            }) != NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into the list of loads with the same
        // base pointers.
        // ...
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTree_rec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, consider it as no gathered loads entries
  // to be handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
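/// Returns the single value in \p VL that needs scheduling, or nullptr if
/// either none of the values needs to be scheduled or more than one does.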
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
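/// Generates a key/subkey pair for the given value \p V, providing a coarse
/// (key) and a fine (subkey) grouping of candidate values for vectorization:
/// loads are sub-keyed via \p LoadsSubkeyGenerator, compares by predicate,
/// calls by intrinsic/callee, etc.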
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes, except for CmpInst, which
    // is also sorted by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      // ...
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      // ...
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      // ...
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for the vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain the same values and create either a
  // perfect diamond match or a shuffled match.
  if (Operands.size() == 2) {
    // Do not count the same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize node, if:
  // 1. at least a single operand is constant or splat;
  // 2. operands have many loop invariants (the instructions are not loop
  //    invariant);
  // 3. at least a single unique operand is supposed to be vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) || isSplat(Op))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found first duplicate - need to add a shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return getTreeEntry(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
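/// Determines how the given bundle \p VL with main/alternate state \p S
/// should be handled: fully vectorized (possibly strided or scattered for
/// loads) or gathered, filling \p CurrentOrder and \p PointerOps for memory
/// operations along the way.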
7578BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7580 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7583 "Expected instructions with same/alternate opcodes only.");
7585 unsigned ShuffleOrOp =
7586 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
7588 switch (ShuffleOrOp) {
7589 case Instruction::PHI: {
7592 return TreeEntry::NeedToGather;
7594 for (
Value *V : VL) {
7595 auto *
PHI = dyn_cast<PHINode>(V);
7600 if (Term &&
Term->isTerminator()) {
7602 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7603 return TreeEntry::NeedToGather;
7608 return TreeEntry::Vectorize;
7610 case Instruction::ExtractValue:
7611 case Instruction::ExtractElement: {
7612 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7615 return TreeEntry::NeedToGather;
7616 if (Reuse || !CurrentOrder.empty())
7617 return TreeEntry::Vectorize;
7619 return TreeEntry::NeedToGather;
7621 case Instruction::InsertElement: {
7625 for (
Value *V : VL) {
7626 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7628 "Non-constant or undef index?");
7632 return !SourceVectors.contains(V);
7635 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7636 "different source vectors.\n");
7637 return TreeEntry::NeedToGather;
7642 return SourceVectors.contains(V) && !
V->hasOneUse();
7645 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7646 "multiple uses.\n");
7647 return TreeEntry::NeedToGather;
7650 return TreeEntry::Vectorize;
7652 case Instruction::Load: {
7661 return TreeEntry::Vectorize;
7663 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7665 LoadEntriesToVectorize.insert(VectorizableTree.size());
7666 return TreeEntry::NeedToGather;
7668 return TreeEntry::ScatterVectorize;
7670 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7672 LoadEntriesToVectorize.insert(VectorizableTree.size());
7673 return TreeEntry::NeedToGather;
7675 return TreeEntry::StridedVectorize;
7679 if (
DL->getTypeSizeInBits(ScalarTy) !=
7680 DL->getTypeAllocSizeInBits(ScalarTy))
7681 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7683 auto *LI = dyn_cast<LoadInst>(V);
7684 return !LI || !LI->isSimple();
7691 return TreeEntry::NeedToGather;
7695 case Instruction::ZExt:
7696 case Instruction::SExt:
7697 case Instruction::FPToUI:
7698 case Instruction::FPToSI:
7699 case Instruction::FPExt:
7700 case Instruction::PtrToInt:
7701 case Instruction::IntToPtr:
7702 case Instruction::SIToFP:
7703 case Instruction::UIToFP:
7704 case Instruction::Trunc:
7705 case Instruction::FPTrunc:
7706 case Instruction::BitCast: {
7708 for (
Value *V : VL) {
7709 if (isa<PoisonValue>(V))
7711 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7714 dbgs() <<
"SLP: Gathering casts with different src types.\n");
7715 return TreeEntry::NeedToGather;
7718 return TreeEntry::Vectorize;
7720 case Instruction::ICmp:
7721 case Instruction::FCmp: {
7726 for (
Value *V : VL) {
7727 if (isa<PoisonValue>(V))
7729 auto *
Cmp = cast<CmpInst>(V);
7730 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
7731 Cmp->getOperand(0)->getType() != ComparedTy) {
7732 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
7733 return TreeEntry::NeedToGather;
7736 return TreeEntry::Vectorize;
7738 case Instruction::Select:
7739 case Instruction::FNeg:
7740 case Instruction::Add:
7741 case Instruction::FAdd:
7742 case Instruction::Sub:
7743 case Instruction::FSub:
7744 case Instruction::Mul:
7745 case Instruction::FMul:
7746 case Instruction::UDiv:
7747 case Instruction::SDiv:
7748 case Instruction::FDiv:
7749 case Instruction::URem:
7750 case Instruction::SRem:
7751 case Instruction::FRem:
7752 case Instruction::Shl:
7753 case Instruction::LShr:
7754 case Instruction::AShr:
7755 case Instruction::And:
7756 case Instruction::Or:
7757 case Instruction::Xor:
7758 case Instruction::Freeze:
7759 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7761 auto *
I = dyn_cast<Instruction>(V);
7762 return I &&
I->isBinaryOp() && !
I->isFast();
7764 return TreeEntry::NeedToGather;
7765 return TreeEntry::Vectorize;
7766 case Instruction::GetElementPtr: {
7768 for (
Value *V : VL) {
7769 auto *
I = dyn_cast<GetElementPtrInst>(V);
7772 if (
I->getNumOperands() != 2) {
7773 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7774 return TreeEntry::NeedToGather;
7780 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7781 for (
Value *V : VL) {
7782 auto *
GEP = dyn_cast<GEPOperator>(V);
7785 Type *CurTy =
GEP->getSourceElementType();
7787 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7788 return TreeEntry::NeedToGather;
7794 for (
Value *V : VL) {
7795 auto *
I = dyn_cast<GetElementPtrInst>(V);
7798 auto *
Op =
I->getOperand(1);
7799 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7800 (
Op->getType() != Ty1 &&
7801 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7802 Op->getType()->getScalarSizeInBits() >
7803 DL->getIndexSizeInBits(
7804 V->getType()->getPointerAddressSpace())))) {
7806 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7807 return TreeEntry::NeedToGather;
7811 return TreeEntry::Vectorize;
7813 case Instruction::Store: {
7815 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7818 if (
DL->getTypeSizeInBits(ScalarTy) !=
7819 DL->getTypeAllocSizeInBits(ScalarTy)) {
7820 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
7821 return TreeEntry::NeedToGather;
7825 for (
Value *V : VL) {
7826 auto *
SI = cast<StoreInst>(V);
7827 if (!
SI->isSimple()) {
7829 return TreeEntry::NeedToGather;
7838 if (CurrentOrder.empty()) {
7839 Ptr0 = PointerOps.
front();
7840 PtrN = PointerOps.
back();
7842 Ptr0 = PointerOps[CurrentOrder.front()];
7843 PtrN = PointerOps[CurrentOrder.back()];
7845 std::optional<int> Dist =
7848 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
7849 return TreeEntry::Vectorize;
7853 return TreeEntry::NeedToGather;
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // ... (query VFDatabase for an available vector library function VecFunc)
    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and these must be the same
      // across the whole bundle for it to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J
                       << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non-alternate shuffles.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcodes like add-sub, then
      // do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
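// Helper for buildTree_rec below: phi nodes in one bundle may list their
// incoming blocks in different orders, so their operands cannot be matched
// by incoming-value position alone. PHIHandler normalizes this by keying
// operand vectors on the incoming blocks of a reference ("main") phi.
// For example (illustrative IR only), the two phis
//   %a = phi i32 [ %x0, %bb0 ], [ %y0, %bb1 ]
//   %b = phi i32 [ %y1, %bb1 ], [ %x1, %bb0 ]
// still yield the per-block operand bundles {%x0, %x1} and {%y0, %y1}.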
namespace {
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    // Fast path: few incoming values, match the blocks directly.
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        // ...
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    // Slow path: first group the incoming-value indices by block.
    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      // ...
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      if (isa<PoisonValue>(V)) {
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        // ...
        auto It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    // Blocks with several incoming indices share one operand list; the
    // representative (first) index already holds it.
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx,
                            unsigned InterleaveFactor) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
    for (Value *V : VL) {
      // ...
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndices.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    bool IsFullVectors = hasFullVectorsOrPowerOf2(
        *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
    if (NumUniqueScalarValues == VL.size() &&
        (VectorizeNonPowerOf2 || IsFullVectors)) {
      ReuseShuffleIndices.clear();
    } else {
      // FIXME: Reshuffling scalars is not yet supported for nodes with
      // padding (non-power-of-2 or non-whole-register vectors).
      if ((UserTreeIdx.UserTE &&
           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
          !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
             return isa<UndefValue>(V) || !isConstant(V);
           }))) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
          // Find the number of elements, which forms full vectors.
          unsigned PWSz = getFullVectorNumberOfElements(
              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(
                PWSz - UniqueValues.size(),
                PoisonValue::get(UniqueValues.front()->getType()));
            // ...
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);
  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no place
  // to insert a shuffle if we need to, so just avoid that issue.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
      LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                        << ".\n");
      if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
        auto It = MultiNodeScalars.find(S.getMainOp());
        if (It != MultiNodeScalars.end()) {
          auto *TEIt = find_if(It->getSecond(),
                               [&](TreeEntry *ME) { return ME->isSame(VL); });
          if (TEIt != It->getSecond().end())
            E = *TEIt;
          else
            E = nullptr;
        } else {
          E = nullptr;
        }
      }
      if (!E) {
        if (!doesNotNeedToBeScheduled(S.getMainOp())) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
        SmallPtrSet<const TreeEntry *, 4> Nodes;
        Nodes.insert(getTreeEntry(S.getMainOp()));
        for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
          Nodes.insert(E);
        SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
        if (any_of(Nodes, [&](const TreeEntry *E) {
              return all_of(E->Scalars,
                            [&](Value *V) { return Values.contains(V); });
            })) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
      } else {
        // Record the reuse of the tree node.
        E->UserTreeIndices.push_back(UserTreeIdx);
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return;
      }
    }
  }

  // Gather if the tree got too deep, unless this is a long chain of
  // loads (or loads extended to the same wider type) that can still be
  // matched as one node.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle vectors (unless REVEC is enabled).
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }
  // If we deal with insert/extract instructions, they all must have constant
  // indices, otherwise we should gather them, not try to vectorize.
  // An alternate-op node with 2 elements and gathered operands is also not
  // worth vectorizing.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or not
    // profitable instructions themselves.
    SmallVector<unsigned, 2> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S && allSameBlock(VL);
  BasicBlock *BB = nullptr;
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // Check that none of the instructions in the bundle are already in the
  // tree.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }

  // Special processing for sorted pointers for a ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it a GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    // ...
  }

  // Don't go into unreachable blocks, there is no point in vectorizing them.
  Instruction *VL0 = S.getMainOp();
  BB = VL0->getParent();
  if (!DT->isReachableFromEntry(BB)) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;

  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI node creation until all other operands are handled.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTree_rec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
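  // Each case below creates the TreeEntry for its opcode kind, wires up the
  // operand bundles, and recurses into them via buildTree_rec (or via
  // CreateOperandNodes for phis). Alternate-opcode bundles are dispatched
  // here as Instruction::ShuffleVector.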
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);

    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    // Keeps the reordered operands to avoid code duplication.
    PHIHandler Handler(*DT, PH, VL);
    Handler.buildOperands();
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      TE->setOperand(I, Handler.getOperands(I));
    SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      Operands[I] = Handler.getOperands(I);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTree_rec() towards the operands.
    TE->setOperand(*this);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
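  // Loads are the interesting case: a bundle may be vectorized as a plain
  // wide load (possibly jumbled, fixed up through CurrentOrder), as a
  // strided load, or as a masked gather (ScatterVectorize), in which case
  // the pointer operands themselves become an operand bundle of the new
  // entry and are vectorized recursively.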
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>},
    // LLVM treats loading/storing it as an i8 struct. If we vectorize
    // loads/stores from such a struct, we read/write packed bits disagreeing
    // with the unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(CurrentOrder);
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with a strided load.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    TE->setOperand(*this);
    if (State == TreeEntry::ScatterVectorize)
      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperand(*this);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
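  // For compares, a mix of a predicate and its swapped form can still be
  // vectorized by commuting operands per lane, e.g. (illustrative only)
  //   %c0 = icmp slt i32 %a0, %b0
  //   %c1 = icmp sgt i32 %b1, %a1
  // both match "slt" once the second lane's operands are swapped.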
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    ValueList Left, Right;
    VLOperands Ops(VL, VL0, *this);
    if (cast<CmpInst>(VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      Ops.reorder();
      Left = Ops.getVL(0);
      Right = Ops.getVL(1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (Cmp->getPredicate() != P0)
          std::swap(LHS, RHS);
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
    }
    TE->setOperand(0, Left);
    TE->setOperand(1, Right);
    buildTree_rec(Left, Depth + 1, {TE, 0});
    buildTree_rec(Right, Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    SmallVector<ValueList, 2> Operands(2);
    // Prepare the operand vector for pointer operands.
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GetElementPtrInst>(V);
      if (!GEP) {
        Operands.front().push_back(V);
        continue;
      }
      Operands.front().push_back(GEP->getPointerOperand());
    }
    TE->setOperand(0, Operands.front());
    // Need to cast all indices to the same type before vectorization to
    // avoid a crash, and to be able to find matches between different
    // gather nodes so the vectorized values can be reused.
    const int IndexIdx = 1;
    Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
    Type *Ty = all_of(VL,
                      [VL0Ty, IndexIdx](Value *V) {
                        auto *GEP = dyn_cast<GetElementPtrInst>(V);
                        if (!GEP)
                          return true;
                        return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                      })
                   ? VL0Ty
                   : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                          ->getPointerOperandType()
                                          ->getScalarType());
    // Prepare the operand vector for the indices.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I) {
        Operands.back().push_back(
            ConstantInt::get(Ty, 0, /*isSigned=*/false));
        continue;
      }
      auto *Op = I->getOperand(IndexIdx);
      auto *CI = dyn_cast<ConstantInt>(Op);
      if (!CI)
        Operands.back().push_back(Op);
      else
        Operands.back().push_back(ConstantFoldIntegerCast(
            CI, Ty, CI->getValue().isSignBitSet(), *DL));
    }
    TE->setOperand(IndexIdx, Operands.back());

    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 TE->dump());
    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    TE->setOperand(*this, isCommutative(VL0));
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // For scalar operands no need to create an entry since there is no
      // need to vectorize them.
      if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
        continue;
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (CI && any_of(VL, [](Value *V) {
          return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(S.getMainOp());
      auto *AltCI = cast<CmpInst>(S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP &&
             "Expected different main/alternate predicates.");
      ValueList Left, Right;
      // Collect operands - commute if a lane uses the swapped predicate or
      // the alternate operation.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        // ... (swap LHS/RHS if this lane matches a swapped predicate)
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      return;
    }

    TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
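// canMapToVector (below) flattens homogeneous aggregates to an equivalent
// element count, e.g. a [4 x {i32, i32}] maps to 8 elements of i32
// (illustrative only), so extractvalues from such aggregates can be treated
// like extractelements from a vector of that width.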
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
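// canReuseExtract (below) answers whether a bundle of extractelement /
// extractvalue instructions reads lanes of one source vector, and if so
// whether the extract indices are already in identity order (CurrentOrder is
// left empty) or form a permutation that the caller must honor.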
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if the load can be rewritten as a load of a vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E so later we can check that no
  // index was used twice and that the access is consecutive.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
  // ... (also consider the cost of calling a vector library function, if one
  // is available for this call)
  auto LibCost = IntrinsicCost;
  // ...
  return {IntrinsicCost, LibCost};
}
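// buildAltOpShuffleMask (below) produces the blend mask for an alternate-op
// node: lane I gets index I when that lane executes the main opcode and
// Sz + I when it executes the alternate one. E.g. a 4-wide add/sub node with
// the pattern add,sub,add,sub yields the mask <0, 5, 2, 7> (illustrative
// only).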
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp,
    SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    // ...
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP ||
            AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}
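// BaseShuffleAnalysis is shared between cost estimation and codegen: it
// walks chains of shufflevector instructions, composing their masks, so a
// shuffle-of-shuffle can be rewritten as one permute of the original
// operands. For example, composing the outer mask <1,0> on top of an inner
// shuffle with mask <2,3> selects elements <3,2> of the innermost source
// (illustrative only).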
namespace {
/// The base class for shuffle instruction emission and shuffle cost
/// estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements. When REVEC is enabled, VF is
  /// VNumElements / ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
  /// Checks if the mask is an identity mask.
  /// \param IsStrict if true, the function returns false if the mask size
  /// does not match the vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison> for VF 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                   ShuffleVectorInst::isIdentityMask(Slice, VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into a single one.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
  /// Looks through shuffles, trying to reduce the final number of shuffles
  /// in the code. Returns true if the resulting mask is an identity mask.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or a size-changing shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store the current mask in IdentityMask so this info is not lost
          // if IdentityOp is selected as the best candidate.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = isUndefVector</*IsPoisonOnly=*/true>(
                            SV->getOperand(0), buildUseMask(LocalVF, ExtMask,
                                                            UseMask::FirstArg))
                            .all();
      bool IsOp2Undef = isUndefVector</*IsPoisonOnly=*/true>(
                            SV->getOperand(1), buildUseMask(LocalVF, ExtMask,
                                                            UseMask::SecondArg))
                            .all();
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elements.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                   V2, buildUseMask(VF, Mask, UseMask::SecondArg))
                   .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through the
        // operands again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
            isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    SmallVector<int> NewMask(Mask);
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
};
} // namespace
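// getGEPCosts (below) compares the cost of keeping a chain of scalar GEPs
// against the single pointer computation needed by vectorized code; the
// difference is the address arithmetic that a wide load/store or masked
// gather actually saves.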
/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Two cases: (1) Ptrs are pointer arguments of scattered loads, or
  // (2) Ptrs are the arguments of loads/stores vectorized as plain wide
  // loads/stores.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: the scalar cost is a chain of unit-strided pointers; the
    // vector code uses only BasePtr, plus whatever pointer instructions must
    // stay because of uses outside these scalar loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in vectorized code if it's not a
      // GEP instruction or has more than one use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code, there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(
        PtrsRetainedInVecCode, BasePtr,
        TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads being transformed into a
    // masked gather; all the scalar GEPs will be removed.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
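// reorderGatherNode groups the scalars of a gather node so that values which
// can be vectorized together (e.g. loads from the same base pointer) become
// contiguous, then keeps the new order only if the estimated shuffle cost
// beats plain buildvector insertion.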
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  MapVector<std::pair<size_t, Value *>, SmallVector<Value *>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>> KeyToIndex;

  // Do not reorder nodes if they are small (just 2 elements), all-constant,
  // or if all instructions already have the same opcode.
  if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  // Try to find a more profitable order: group loads by (potentially) the
  // same base pointer; other values are bucketed by opcode-derived keys.
  SmallDenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                            LI->getType(), LI->getPointerOperand(), *DL, *SE,
                            /*StrictCheck=*/true))
          return hash_value(RLI->getPointerOperand());
      }
      for (LoadInst *RLI : LIt->second) {
        if (arePointersCompatible(RLI->getPointerOperand(),
                                  LI->getPointerOperand(), *TLI))
          return hash_value(RLI->getPointerOperand());
      }
      if (LIt->second.size() > 2)
        return hash_value(LIt->second.back()->getPointerOperand());
    }
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Bucket the scalars and track whether the incoming order is already
  // grouped (IsOrdered).
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
        !isDeleted(Inst) && !isVectorized(V)) {
      // ... (compute Key/Idx via generateKeySubkey with GenerateLoadsSubkey)
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider reordering free there.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Simple cost estimation: shuffled order vs. plain buildvector.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                             CostKind, Idx, getWidenedType(ScalarTy, Sz));
  }
  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
      if (DemandedElts[I])
        Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                                 CostKind, I * ScalarTyNumElements, FTy);
  } else {
    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                          /*Extract=*/false, CostKind);
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V)) {
      ReorderMask[I] = PoisonMaskElem;
    } else if (isConstant(V) || DemandedElts[I]) {
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
  std::fill(ReorderMask.begin(), ReorderMask.end(), PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost +=
        ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
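// transformNodes runs post-construction transforms over the finished graph:
// reordering gather nodes, splitting long gathers into vectorizable
// sub-slices, converting reversed consecutive loads/stores into strided
// accesses when the target makes that cheaper, recognizing interleaved
// stores, and folding select trees into min/max combined nodes.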
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if at least one is constant, or they are splats,
  // or the lookahead heuristic finds a good rooting pair.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // The tree may grow here, so iterate only up to the original size.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      // Try to find vectorizable sub-sequences and mark them as combined
      // (insertvector) entries.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - reuse the existing
          // node if it fully matches the slice, otherwise skip.
          if (const TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
          }
          // Constant slices are handled effectively already - skip. Also do
          // not try to vectorize small splats with only a single non-undef
          // element.
          bool IsSplat = isSplat(Slice);
          if (Slices.empty() || !IsSplat ||
              (VF <= 2 &&
               2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
                                  Slice.front()->getType(), VF)),
                              1U, VF / 2) !=
                   std::clamp(TTI->getNumberOfParts(getWidenedType(
                                  Slice.front()->getType(), 2 * VF)),
                              1U, VF)) ||
              count(Slice, Slice.front()) ==
                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
                                                                   : 1)) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values, or only if all users are
              // going to be vectorized.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  !all_of(Slice, [&](Value *V) {
                    if (isa<PoisonValue>(V))
                      return true;
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                 UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                // ... (skip slices that would only become gathers/scatters)
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              S.getMainOp(),
                              cast<Instruction>(*find_if(
                                  reverse(Slice), IsaPred<Instruction>)),
                              S))) {
                // Do not vectorize extractelements (handled effectively
                // already) or cheap instructions with non-vectorizable
                // operands.
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          // If any instruction is vectorized already - do not try again.
          if (TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
            SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
            AddCombinedNode(SE->Idx, Cnt, Sz);
            continue;
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // Restore ordering, if no extra vectorization happened.
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
    }
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as a
      // strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind) +
            ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask,
                             CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse;
          // transform the node to a strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as a
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind) +
            ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask,
                             CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store; transform the node to a strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }

  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with a small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        (E.getOpcode() == Instruction::Load ||
         (!E.getOpcode() && any_of(E.Scalars,
                                   [&](Value *V) {
                                     return isa<LoadInst>(V) &&
                                            !isVectorized(V) &&
                                            !isDeleted(cast<Instruction>(V));
                                   }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
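// ShuffleCostEstimator mirrors the codegen-time shuffle builder but
// accumulates TTI costs instead of emitting IR: the same
// BaseShuffleAnalysis::createShuffle walk is instantiated with a
// ShuffleCostBuilder, so gather/permute decisions are priced exactly the way
// they would later be materialized.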
class ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, the common mask still matches the nodes estimated so far.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    // ...
    if (auto *VTy = dyn_cast<VectorType>(Ty))
      return ConstantVector::getSplat(
          VTy->getElementCount(),
          Constant::getAllOnesValue(VTy->getElementType()));
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add a broadcast for a non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          return TTI.getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, {}, CostKind,
              std::distance(VL.begin(), It) * getNumElements(ScalarTy),
              cast<FixedVectorType>(ScalarTy));
        }
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      GatherCost +=
          ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy, ShuffleMask,
                           CostKind, /*Index=*/0, /*SubTp=*/nullptr,
                           /*Args=*/*It);
      // ...
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }
  /// Compute the cost of creating a vector containing the extracted values
  /// (if it was a vectorized operation).
  InstructionCost computeExtractCost(
      ArrayRef<Value *> VL, ArrayRef<int> Mask,
      ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
      unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    // Check whether a sub-mask fits into at most 2 hardware registers; if
    // so, normalize it in place and report the shuffle kind and the base
    // indices of the registers involved.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check if we are permuting the same single or two input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallSet<int, 4> RegIndices;
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector is not split into several register-sized chunks.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost += ::getShuffleCost(TTI, *RegShuffleKind,
                                 getWidenedType(ScalarTy, EltsPerVector),
                                 SubMask);
      }
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 getWidenedType(ScalarTy, BaseVF), {},
                                 CostKind, Idx,
                                 getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt: cost the whole slice as a single shuffle and keep
      // the cheaper of the two estimates.
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          MaskSlice);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Transforms mask \p CommonMask per given \p Mask to make a proper set
  /// after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }
  /// Estimate the shuffle cost of permuting the two nodes \p E1 / \p E2 for
  /// part \p Part of the common mask, merging estimates for the same nodes
  /// whenever possible.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffled: if we
      // already requested the cost of reshuffling E1 and E2, no need to
      // estimate it again.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the
      // matched part and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy =
            IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not an identity/broadcast? Check whether keeping the original order
      // is cheaper.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
                                             CommonMask);
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused, if the same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((!TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of an instruction are going to be vectorized and the
        // instruction itself is not, consider this extractelement as dead
        // and remove its cost from the final cost of the vectorized tree.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        const TreeEntry *VE = R.getTreeEntry(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (VE && VE != E))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for an instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of the
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(),
                EE->getVectorOperandType(), Idx);
            // Add back the cost of the s|zext, which is subtracted
            // separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that the gather of extractelements can be represented as just a
    // shuffle of a single (or two) input vectors; skip if the same
    // extractelements were already costed on a previous node.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size() ||
        MaskVecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
                                  MaskVecTy->getNumElements() / NumParts))
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size() ||
        MaskVecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
                                  MaskVecTy->getNumElements() / NumParts))
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors (in the form of raw values).
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF,
          cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
              ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: need a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, expand vector types into scalar types.
        unsigned VecTyNumElements = VecTy->getNumElements();
        SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
        for (auto [I, V] : enumerate(Vals)) {
          Type *ScalarTy = V->getType()->getScalarType();
          Constant *NewVal;
          if (isa<PoisonValue>(V))
            NewVal = PoisonValue::get(ScalarTy);
          else if (isa<UndefValue>(V))
            NewVal = UndefValue::get(ScalarTy);
          else
            NewVal = Constant::getNullValue(ScalarTy);
          std::fill_n(NewVals.begin() + I * VecTyNumElements,
                      VecTyNumElements, NewVal);
        }
        Vals.swap(NewVals);
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
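// getEntryCost (below) computes, per tree entry, the difference between the
// vectorized cost and the summed scalar costs, with CommonCost capturing the
// shuffles implied by reordering/reuse; a negative result means the node is
// profitable to vectorize.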
/// Builds the argument types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  SmallVector<Type *> ArgTys;
  // ...
  return ArgTys;
}

InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = getValueType(VL[0]);
  if (!isValidElementType(ScalarTy))
    return InstructionCost::getInvalid();
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  if (E->isGather()) {
    if (allConstant(VL))
      return 0;
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
                                     !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> GetVectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For casts/calls there is no need to calculate the cost for each
          // particular instruction: use the cost of a single instruction
          // times the total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = GetVectorCost(CommonCost);
        // Check if the current node must be resized because the operand node
        // was resized by the minbitwidth analysis.
        if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
            E->Idx != 0 &&
            (E->getOpcode() != Instruction::Load ||
             !E->UserTreeIndices.empty())) {
          const EdgeInfo &EI =
              *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
                return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
              });
          if (EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy =
                  getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               CCH, CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
11203 assert((E->State == TreeEntry::Vectorize ||
11204 E->State == TreeEntry::StridedVectorize) &&
11205 "Entry state expected to be Vectorize or StridedVectorize here.");
11209 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
11210 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11211 "Calculated GEPs cost for Tree"));
11213 return VecCost - ScalarCost;
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead
    // and we can adjust the cost by removing their cost.
    if (VI && SelectOnly) {
      assert((!Ty->isVectorTy() || SLPReVec) &&
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of
          // extractelement/ext pair.
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
          // Subtract the cost of s|zext which is subtracted separately.
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      // ...
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-sized vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate a correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
    auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                               InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // Second cost - permutation with subvector, if some elements are from the
    // initial vector or inserting a subvector.
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      } else {
        // ...
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
        // ...
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind, VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpInst::Predicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
        // ...
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
      if (IntrinsicCost.isValid())
        ScalarCost = IntrinsicCost;
      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred,
          CostKind, getOperandInfo(E->getOperand(0)),
          getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal
    // node. Hence address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate the cost of GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, std::nullopt,
              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }

    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      // ...
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             // ...
             (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp())) &&
             "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and the
    // same main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here since we're going to reuse
        // the same main/alternate vector ops, just do different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                        TTI::CastContextHint::None, CostKind);
          LLVM_DEBUG({
            dbgs() << "SLP: alternate extension, which should be truncated.\n";
            E->dump();
          });
          return VecCost;
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            // ...
          },
          Mask);
      // ...
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target then consider its cost.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If a group uses the mask in order, the shufflevector can be
            // eliminated by instcombine, then the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              int NextIndex = 0;
              if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
                    assert(isa<ShuffleVectorInst>(V) &&
                           "Not supported shufflevector usage.");
                    auto *SV = cast<ShuffleVectorInst>(V);
                    int Index;
                    [[maybe_unused]] bool IsExtractSubvectorMask =
                        SV->isExtractSubvectorMask(Index);
                    assert(IsExtractSubvectorMask &&
                           "Not supported shufflevector usage.");
                    if (NextIndex != Index)
                      return false;
                    NextIndex += SV->getShuffleMask().size();
                    return true;
                  }))
                return ::getShuffleCost(
                    // ...
                    );
            }
            return TTI::TCC_Free;
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
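
/// Checks whether a tree of height 1 or 2 is still profitable to vectorize,
/// e.g. a vectorizable root whose operands are gathers of loads, splats,
/// constants or extractelements.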
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            ((TE->getOpcode() == Instruction::ExtractElement ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask)) ||
            (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it is not a load.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;

  return true;
}
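
/// Checks whether \p Root (an or/shift-left expression over a zext'ed load)
/// is likely to be combined into a wide scalar load by the backend, making
/// vectorization unprofitable.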
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reassemble
  // it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
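
/// Checks whether the tree is tiny (smaller than MinTreeSize) and cannot be
/// proven fully vectorizable, in which case vectorization is skipped.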
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, we can skip it, if the cost threshold
  // is default. The cost of vectorized PHI nodes is almost always 0 + the
  // cost of gathers/buildvectors.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                TE->getOpcode() != Instruction::ExtractElement &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // ...
  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false, TTI::TCK_RecipThroughput) >
          -SLPCostThreshold)
    return true;

  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      // ...
      count_if(VectorizableTree,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() &&
                        TE->getOpcode() == Instruction::Load &&
                        // ...
                        false;
               }))
    return true;

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return false;
}
bool BoUpSLP::isTreeNotExtendable() const {
  // ...
  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (!E.isGather())
      continue;
    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
      return false;
    // ...
  }
  // ...
}
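
/// Estimates the register spill cost of the tree: walks the vectorized
/// scalars in dominance order and queries TTI for the cost of keeping values
/// live across call instructions found between them.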
InstructionCost BoUpSLP::getSpillCost() const {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree, query
  // TTI to see if there is a cost to keeping values live over it (for
  // example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  InstructionCost Cost = 0;

  SmallPtrSet<Instruction *, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so
  // later instructions are guaranteed to be visited first. For instructions
  // in different basic blocks, we only scan to the beginning of the block, so
  // their order does not matter, as long as all instructions in a basic block
  // are grouped together. Using dominance ensures a deterministic order.
  SmallVector<Instruction *, 16> OrderedScalars;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    if (!Inst)
      continue;
    OrderedScalars.push_back(Inst);
  }
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  });

  for (Instruction *Inst : OrderedScalars) {
    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
                                 PrevInstIt =
                                     PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      auto NoCallIntrinsic = [this](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
          if (II->isAssumeLikeIntrinsic())
            return true;
          FastMathFlags FMF;
          SmallVector<Type *, 4> Tys;
          for (auto &ArgOp : II->args())
            Tys.push_back(ArgOp->getType());
          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
            FMF = FPMO->getFastMathFlags();
          // ...
          if (IntrCost < CallCost)
            return true;
        }
        return false;
      };

      // Debug information does not impact spill cost.
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;

      ++PrevInstIt;
    }

    if (NumCalls) {
      SmallVector<Type *, 4> V;
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        V.push_back(getWidenedType(ScalarTy, BundleWidth));
      }
      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
    }

    PrevInst = Inst;
  }

  return Cost;
}
/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
/// buildvector sequence.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  // ...
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        // ...
        )
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        // ...
        )
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
namespace {
/// Returns the incoming Value *, if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace
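
/// Analyzes the provided shuffle masks and performs the requested actions on
/// the vectors with the given masks: resizes the base vector if it is not
/// poison/undef, shuffles pairs of input vectors and combines the masks
/// between the steps.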
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    auto *V = ValueSelect::get<T *>(Base);
    (void)V;
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for the single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform 2
    // vectors shuffle step by step, combining the masks between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size,
      // we can shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          Mask[I] = (Res1.second ? I : Mask[I]);
        } else if (SecMask[I] != PoisonMaskElem) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  bool IsBaseNotUndef = !IsBaseUndef.all();
  (void)IsBaseNotUndef;
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and the shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
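
/// Computes the total cost of the tree: the sum of the entry costs, the cost
/// of extracting externally used scalars, the spill cost, minus the cost of
/// insertelement sequences replaced by final shuffles.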
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.isGather()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle.\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track of the {Scalar, Index, User} tuples.
  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be removed
    // as well).
    if (EphValues.count(EU.User))
      continue;

    // Used in unreachable blocks or in EH pads (rarely executed) or is
    // terminated with unreachable instruction.
    if (BasicBlock *UserParent =
            EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
        UserParent &&
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         isa_and_present<UnreachableInst>(UserParent->getTerminator())))
      continue;

    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // No extract cost for a vector "scalar".
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;

    // If found user is an insertelement, do not calculate the extract cost
    // but try to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
          auto *It = find_if(
              ShuffledInserts,
              [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
                // Checks if 2 insertelements are from the same buildvector.
                InsertElementInst *VecInsert = Data.InsertElements.front();
                return areTwoInsertFromSameBuildVector(
                    VU, VecInsert, [this](InsertElementInst *II) -> Value * {
                      Value *Op0 = II->getOperand(0);
                      if (getTreeEntry(II) && !getTreeEntry(Op0))
                        return nullptr;
                      return Op0;
                    });
              });
          int VecId = -1;
          if (It == ShuffledInserts.end()) {
            auto &Data = ShuffledInserts.emplace_back();
            Data.InsertElements.emplace_back(VU);
            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
            VecId = ShuffledInserts.size() - 1;
            auto It = MinBWs.find(ScalarTE);
            if (It != MinBWs.end() &&
                VectorCasts
                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                    .second) {
              unsigned BWSz = It->second.first;
              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
              unsigned VecOpcode;
              if (DstBWSz < BWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              InstructionCost C = TTI->getCastInstrCost(
                  VecOpcode, FTy,
                  getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                                 FTy->getNumElements()),
                  TTI::CastContextHint::None, CostKind);
              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                                << " for extending externally used vector with "
                                   "non-equal minimum bitwidth.\n");
              Cost += C;
            }
          } else {
            if (isFirstInsertElement(VU, It->InsertElements.front()))
              It->InsertElements.front() = VU;
            VecId = std::distance(ShuffledInserts.begin(), It);
          }
          int InIdx = *InsertIdx;
          SmallVectorImpl<int> &Mask =
              ShuffledInserts[VecId].ValueMasks[ScalarTE];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[InIdx] = EU.Lane;
          DemandedElts[VecId].setBit(InIdx);
          continue;
        }
      }
    }

    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign extend if needed.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      unsigned Extend = It->second.second ? Instruction::SExt
                                          : Instruction::ZExt;
      // ...
    } else {
      ExtraCost =
          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                                  EU.Lane, EU.Scalar, ScalarUserAndIdx);
    }
12604 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12605 Entry->getOpcode() == Instruction::Load) {
12607 auto IsPhiInLoop = [&](
const ExternalUser &U) {
12608 if (
auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12609 auto *
I = cast<Instruction>(U.Scalar);
12610 const Loop *L = LI->getLoopFor(Phi->getParent());
12611 return L && (Phi->getParent() ==
I->getParent() ||
12612 L == LI->getLoopFor(
I->getParent()));
12616 if (!ValueToExtUses) {
12617 ValueToExtUses.emplace();
12620 if (IsPhiInLoop(
P.value()))
12623 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
12628 auto *Inst = cast<Instruction>(EU.Scalar);
12630 auto OperandIsScalar = [&](
Value *V) {
12631 if (!getTreeEntry(V)) {
12635 if (
auto *EE = dyn_cast<ExtractElementInst>(V))
12636 return !EE->hasOneUse() || !MustGather.contains(EE);
12639 return ValueToExtUses->contains(V);
12641 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
12642 bool CanBeUsedAsScalarCast =
false;
12643 if (
auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12644 if (
auto *
Op = dyn_cast<Instruction>(CI->
getOperand(0));
12645 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
12647 (getTreeEntry(
Op) && !ValueToExtUses->contains(
Op))
12650 if (ScalarCost + OpCost <= ExtraCost) {
12651 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
12652 ScalarCost += OpCost;
12656 if (CanBeUsedAsScalar) {
12657 bool KeepScalar = ScalarCost <= ExtraCost;
12661 bool IsProfitablePHIUser =
12663 VectorizableTree.front()->Scalars.size() > 2)) &&
12664 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12668 auto *PHIUser = dyn_cast<PHINode>(U);
12669 return (!PHIUser ||
12670 PHIUser->getParent() !=
12672 VectorizableTree.front()->getMainOp())
12677 return ValueToExtUses->contains(V);
12679 if (IsProfitablePHIUser) {
12683 (!GatheredLoadsEntriesFirst.has_value() ||
12684 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12685 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
12686 return ValueToExtUses->contains(V);
12688 auto It = ExtractsCount.
find(Entry);
12689 if (It != ExtractsCount.
end()) {
12690 assert(ScalarUsesCount >= It->getSecond().size() &&
12691 "Expected total number of external uses not less than "
12692 "number of scalar uses.");
12693 ScalarUsesCount -= It->getSecond().size();
12698 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
12701 ExternalUsesAsOriginalScalar.
insert(EU.Scalar);
12703 auto It = ValueToExtUses->find(V);
12704 if (It != ValueToExtUses->end()) {
12706 ExternalUses[It->second].User = nullptr;
12709 ExtraCost = ScalarCost;
12710 if (!IsPhiInLoop(EU))
12711 ExtractsCount[Entry].
insert(Inst);
12712 if (CanBeUsedAsScalarCast) {
12713 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
12716 if (
auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12718 auto It = ValueToExtUses->find(V);
12719 if (It != ValueToExtUses->end()) {
12721 ExternalUses[It->second].User = nullptr;
12730 ExtractCost += ExtraCost;
12734 for (
Value *V : ScalarOpsFromCasts) {
12735 ExternalUsesAsOriginalScalar.
insert(V);
12736 if (
const TreeEntry *E = getTreeEntry(V)) {
12737 ExternalUses.emplace_back(V,
nullptr, E->findLaneForValue(V));
12741 if (!VectorizedVals.
empty()) {
12742 const TreeEntry &Root = *VectorizableTree.front();
12743 auto BWIt = MinBWs.find(&Root);
12744 if (BWIt != MinBWs.end()) {
12745 Type *DstTy = Root.Scalars.front()->getType();
12748 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12749 if (OriginalSz != SrcSz) {
12750 unsigned Opcode = Instruction::Trunc;
12751 if (OriginalSz > SrcSz)
12752 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12754 if (
auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
  InstructionCost SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                           getWidenedType(TE->getMainOp()->getType(), VecVF),
                           OrigMask);
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      Cost += C;
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0 &&
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I],
        /*Insert=*/true, /*Extract=*/false, TTI::TCK_RecipThroughput);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize to the final type, if required.
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled as the free extension.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        InstructionCost CastCost = TTI->getCastInstrCost(
            Opcode, DstVecTy, SrcVecTy, CCH, TTI::TCK_RecipThroughput);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
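
/// Tries to represent a list of gathered scalars, some of which are
/// extractelements from one or two source vectors, as a shuffle of those
/// source vectors (for a single register).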
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    // ...
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements were
  // not selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
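
/// Checks whether the part \p Part of the gathered scalars \p VL can be
/// reused as a shuffle of one or two already vectorized tree entries,
/// filling \p Mask and \p Entries accordingly.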
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // ...
  const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
                                ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
                                : TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // Main node of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Checks whether the vector code for the \p TE node will be emitted
    // before the code for the tree entry whose last instruction is
    // \p InsertPt, so the already vectorized value can be reused.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes users.
    if (TEInsertPt->getParent() != InsertBlock &&
        // ...
        )
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        // ...
        )
      return false;
    return true;
  };
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  DenseMap<Value *, int> UsedValuesEntry;
  for (Value *V : VL) {
    if (isConstant(V))
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();

      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If 2 gathers are operands of the same entry, compare operand
        // indices, use the earlier one as the base.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // If the user instruction is used for some reason in different
        // vectorized nodes - make it depend on the index.
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
          continue;
      }

      // Check if the user node of the TE comes after the user node of TEPtr,
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      VToTEs.insert(TEPtr);
    }
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
        if (VTE->State != TreeEntry::Vectorize) {
          auto It = MultiNodeScalars.find(V);
          if (It == MultiNodeScalars.end())
            continue;
          VTE = *It->getSecond().begin();
          // Iterate through all vectorized nodes.
          auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
            return MTE->State == TreeEntry::Vectorize;
          });
          if (MIt == It->getSecond().end())
            continue;
          VTE = *MIt;
        }
      }
      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
        continue;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes to vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use
      // V. If there are no such nodes, consider that we have another one
      // input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Do we have a non-empty intersection of previously listed tree
        // entries and tree entries using current V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need to add a second set of
      // possible source vectors.
      if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2 - not a
        // permutation, fallback to the regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }
  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        // ...
      }
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // the max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI).getOpcode())
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if a value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements (these instructions will be
  // handled by extractelements processing) or may form a vector node in the
  // future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible if they have the same/alternate
  // opcode and same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI).getOpcode() &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars, if they are constants, or instructions
    // that can be vectorized as a result of the following vector build
    // vectorization.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries, which can be
  // used for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar. If it is the first entry,
    // set Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
    // These indices are used when calculating the final shuffle mask as the
    // vector offset.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // We may have here 1 or 2 entries only. If the number of scalars is equal
    // to the number of entries, no need to do the analysis, it is not very
    // profitable. Since VL is not the same as TE->Scalars, it means we
    // already have some shuffles before. Cut off the not profitable case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask, check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index of
  // the scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation if the shuffle is beneficial over a buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      // ...
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(),
        // ...
        (MaxElement % VF) - (MinElement % VF) + 1);
    if (NewVF < VF) {
      for_each(SubMask, [&](int &Idx) {
        if (Idx == PoisonMaskElem)
          return;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      });
      VF = NewVF;
    }

    auto *VecTy = getWidenedType(VL.front()->getType(), VF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() == 1 ? TTI::SK_PermuteSingleSrc
                                                  : TTI::SK_PermuteTwoSrc,
                              VecTy, Mask);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      bool IsIdentity = true;
      SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      bool IsIdentity = true;
      SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    // ...
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
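
/// Per-register wrapper around isGatherShuffledSingleRegisterEntry: splits
/// the gathered scalars into NumParts slices and checks each slice for a
/// possible shuffle with existing tree entries.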
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 nodes is not implemented yet.
  if (TE->isNonPowOf2Vec())
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
            isSplat(TE->Scalars)) &&
           "Expected splat or extractelements only node.");
    return {};
  }
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
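
/// \returns the cost of gathering (inserting) the values in \p VL into a
/// vector, including the cost of a permute if some elements are duplicated.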
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
      V = nullptr;
    }
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }

    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }

    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "Only supported by REVEC.");
      // We don't need to insert elements one by one. Instead, we can insert
      // the entire vector into the destination.
      Cost = 0;
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          Cost += TTI->getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
    } else {
      Cost = TTI->getScalarizationOverhead(VecTy,
                                           /*DemandedElts=*/~ShuffledElements,
                                           /*Insert=*/true,
                                           /*Extract=*/false, CostKind, VL);
    }
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
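
/// \returns the last instruction in the bundle \p E, after which the
/// vectorized code for the bundle can be safely emitted.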
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    BB = LastInst->getParent();
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    return *Res;
  }

  // ...
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       any_of(E->Scalars,
              [](Value *V) {
                return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
              })) ||
      all_of(E->Scalars,
             [](Value *V) {
               return isa<PoisonValue>(V) ||
                      (!isVectorLikeInstWithConstOps(V) &&
                       isUsedOutsideBlock(V));
             }) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();
  if (Res)
    return *Res;

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    if (doesNotNeedToBeScheduled(V))
      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
  }

  // LastInst can still be null at this point if there is either not an entry
  // for BB in BlocksSchedules or there's no ScheduleData available for
  // VL.back(). This can be the case if buildTree_rec aborts for various
  // reasons (e.g., the maximum recursion depth is reached, the maximum region
  // size is reached, etc.). ScheduleData is initialized in the scheduling
  // "dry-run".
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
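
/// Emits a sequence of insertelement instructions that builds a vector from
/// the scalars in \p VL, postponing loop-dependent inserts so loop-invariant
/// code can still be hoisted.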
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from the current block and/or the blocks which
  // are part of the current loop. These instructions will be inserted at the
  // end to make it possible to optimize loops and hoist invariant
  // instructions out of the loops body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(VL[I], I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
          Scalar = Op;
      }
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // ...
      auto *II = dyn_cast<IntrinsicInst>(InsElt);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
        }
      }
    }
    return Vec;
  };

  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, at the end to
  // make it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
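
/// Merges shuffle masks and emits the final shuffle instructions only when
/// actually required, walking shuffle trees to find the best matching vector
/// for each shuffle (IR-emission counterpart of the cost estimator above).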
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle.
  SmallVector<Value *> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not the identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes 2 input vectors to match their sizes, if they are not equal
    /// yet. The smallest vector is resized to the size of the larger vector.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms the mask \p CommonMask per the given \p Mask to make a proper
  /// set after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }

  /// Casts the value \p V to the vector type with the same number of elements
  /// but with the base scalar type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14195 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14196 unsigned NumParts,
bool &UseVecBaseAsInput) {
14197 UseVecBaseAsInput =
false;
14199 Value *VecBase =
nullptr;
14201 if (!E->ReorderIndices.empty()) {
14203 E->ReorderIndices.end());
14206 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
14210 auto *EI = cast<ExtractElementInst>(VL[
I]);
14211 VecBase = EI->getVectorOperand();
14212 if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
14213 VecBase = TE->VectorizedValue;
14214 assert(VecBase &&
"Expected vectorized value.");
14215 UniqueBases.
insert(VecBase);
14218 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14219 (NumParts != 1 &&
count(VL, EI) > 1) ||
14221 const TreeEntry *UTE = R.getTreeEntry(U);
14222 return !UTE || R.MultiNodeScalars.contains(U) ||
14223 (isa<GetElementPtrInst>(U) &&
14224 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14225 count_if(R.VectorizableTree,
14226 [&](const std::unique_ptr<TreeEntry> &TE) {
14227 return any_of(TE->UserTreeIndices,
14228 [&](const EdgeInfo &Edge) {
14229 return Edge.UserTE == UTE;
14231 is_contained(VL, EI);
14235 R.eraseInstruction(EI);
14237 if (NumParts == 1 || UniqueBases.
size() == 1) {
14238 assert(VecBase &&
"Expected vectorized value.");
14239 return castToScalarTyElem(VecBase);
14241 UseVecBaseAsInput =
true;
14251 Value *Vec =
nullptr;
14254 for (
unsigned Part : seq<unsigned>(NumParts)) {
14258 constexpr int MaxBases = 2;
14260 auto VLMask =
zip(SubVL, SubMask);
14261 const unsigned VF = std::accumulate(
14262 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
const auto &
D) {
14263 if (std::get<1>(D) == PoisonMaskElem)
14266 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14267 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14268 VecOp = TE->VectorizedValue;
14269 assert(VecOp &&
"Expected vectorized value.");
14270 const unsigned Size =
14271 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14272 return std::max(S, Size);
14274 for (
const auto [V,
I] : VLMask) {
14277 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14278 if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
14279 VecOp = TE->VectorizedValue;
14280 assert(VecOp &&
"Expected vectorized value.");
14281 VecOp = castToScalarTyElem(VecOp);
14282 Bases[
I / VF] = VecOp;
14284 if (!Bases.front())
14287 if (Bases.back()) {
14288 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14289 TransformToIdentity(SubMask);
14291 SubVec = Bases.front();
14298 Mask.slice(
P * SliceSize,
14305 "Expected first part or all previous parts masked.");
14306 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14309 cast<FixedVectorType>(Vec->
getType())->getNumElements();
14311 unsigned SubVecVF =
14312 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
14313 NewVF = std::max(NewVF, SubVecVF);
14316 for (
int &
Idx : SubMask)
14319 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14320 Vec = createShuffle(Vec, SubVec, VecMask);
14321 TransformToIdentity(VecMask);
14329 std::optional<Value *>
14335 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
14337 return std::nullopt;
14340 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
14349 Value *V1 = E1.VectorizedValue;
14351 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14352 if (isa<PoisonValue>(V))
14354 return !isKnownNonNegative(
14355 V, SimplifyQuery(*R.DL));
14357 Value *V2 = E2.VectorizedValue;
14358 if (V2->getType()->isIntOrIntVectorTy())
14359 V2 = castToScalarTyElem(V2,
any_of(E2.Scalars, [&](
Value *V) {
14360 if (isa<PoisonValue>(V))
14362 return !isKnownNonNegative(
14363 V, SimplifyQuery(*R.DL));
14370 Value *V1 = E1.VectorizedValue;
14372 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14373 if (isa<PoisonValue>(V))
14375 return !isKnownNonNegative(
14376 V, SimplifyQuery(*R.DL));
14382 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
14384 isa<FixedVectorType>(V2->getType()) &&
14385 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14386 V1 = castToScalarTyElem(V1);
14387 V2 = castToScalarTyElem(V2);
14388 if (InVectors.
empty()) {
14391 CommonMask.
assign(Mask.begin(), Mask.end());
14395 if (InVectors.
size() == 2) {
14396 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14397 transformMaskAfterShuffle(CommonMask, CommonMask);
14398 }
else if (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
14400 Vec = createShuffle(Vec,
nullptr, CommonMask);
14401 transformMaskAfterShuffle(CommonMask, CommonMask);
14403 V1 = createShuffle(V1, V2, Mask);
14404 unsigned VF = std::max(getVF(V1), getVF(Vec));
14405 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14407 CommonMask[
Idx] =
Idx + VF;
14408 InVectors.
front() = Vec;
14409 if (InVectors.
size() == 2)
14410 InVectors.
back() = V1;
14417 "castToScalarTyElem expects V1 to be FixedVectorType");
14418 V1 = castToScalarTyElem(V1);
14419 if (InVectors.
empty()) {
14421 CommonMask.
assign(Mask.begin(), Mask.end());
14424 const auto *It =
find(InVectors, V1);
14425 if (It == InVectors.
end()) {
14426 if (InVectors.
size() == 2 ||
14429 if (InVectors.
size() == 2) {
14430 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14431 transformMaskAfterShuffle(CommonMask, CommonMask);
14432 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14433 CommonMask.
size()) {
14434 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14435 transformMaskAfterShuffle(CommonMask, CommonMask);
14437 unsigned VF = std::max(CommonMask.
size(), Mask.size());
14438 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14441 V->getType() != V1->
getType()
14443 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14444 ->getNumElements();
14445 if (V->getType() != V1->
getType())
14446 V1 = createShuffle(V1,
nullptr, Mask);
14447 InVectors.
front() = V;
14448 if (InVectors.
size() == 2)
14449 InVectors.
back() = V1;
14456 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14462 int VF = getVF(V1);
14463 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14465 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
14474 Value *Root =
nullptr) {
14475 return R.gather(VL, Root, ScalarTy,
14477 return createShuffle(V1, V2, Mask);
14486 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14489 IsFinalized =
true;
14492 if (ScalarTyNumElements != 1) {
14496 ExtMask = NewExtMask;
14500 if (InVectors.
size() == 2) {
14501 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14504 Vec = createShuffle(Vec,
nullptr, CommonMask);
14506 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14510 "Expected vector length for the final value before action.");
14511 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
14514 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14515 Vec = createShuffle(Vec,
nullptr, ResizeMask);
14517 Action(Vec, CommonMask);
14518 InVectors.
front() = Vec;
14520 if (!SubVectors.empty()) {
14522 if (InVectors.
size() == 2) {
14523 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14526 Vec = createShuffle(Vec,
nullptr, CommonMask);
14528 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14531 auto CreateSubVectors = [&](
Value *Vec,
14533 for (
auto [E,
Idx] : SubVectors) {
14534 Value *
V = E->VectorizedValue;
14535 if (
V->getType()->isIntOrIntVectorTy())
14536 V = castToScalarTyElem(V,
any_of(E->Scalars, [&](
Value *V) {
14537 if (isa<PoisonValue>(V))
14539 return !isKnownNonNegative(
14540 V, SimplifyQuery(*R.DL));
14542 unsigned InsertionIndex =
Idx * ScalarTyNumElements;
14544 Builder, Vec, V, InsertionIndex,
14545 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
14547 if (!CommonMask.
empty()) {
14549 std::next(CommonMask.
begin(), InsertionIndex),
14550 std::next(CommonMask.
begin(),
14551 (
Idx + E->getVectorFactor()) * ScalarTyNumElements),
14557 if (SubVectorsMask.
empty()) {
14558 Vec = CreateSubVectors(Vec, CommonMask);
14561 copy(SubVectorsMask, SVMask.begin());
14562 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
14565 I1 = I2 + CommonMask.
size();
14570 Vec = createShuffle(InsertVec, Vec, SVMask);
14571 for (
unsigned I : seq<unsigned>(CommonMask.
size())) {
14576 InVectors.
front() = Vec;
14579 if (!ExtMask.
empty()) {
14580 if (CommonMask.
empty()) {
14584 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
14587 NewMask[
I] = CommonMask[ExtMask[
I]];
14589 CommonMask.
swap(NewMask);
14592 if (CommonMask.
empty()) {
14593 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
14594 return InVectors.
front();
14596 if (InVectors.
size() == 2)
14597 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14598 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
14603 "Shuffle construction must be finalized.");
14607BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(
const TreeEntry *E,
14608 unsigned NodeIdx) {
14612 if (!S && VL.
front()->getType()->isPointerTy()) {
14613 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
14614 if (It != VL.
end())
14619 auto CheckSameVE = [&](
const TreeEntry *VE) {
14620 return VE->isSame(VL) &&
14621 (
any_of(VE->UserTreeIndices,
14622 [E, NodeIdx](
const EdgeInfo &EI) {
14623 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14625 any_of(VectorizableTree,
14626 [E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
14627 return TE->isOperandGatherNode(
14628 {
const_cast<TreeEntry *
>(E), NodeIdx}) &&
14629 VE->isSame(TE->Scalars);
14632 TreeEntry *VE = getTreeEntry(S.getMainOp());
14633 if (VE && CheckSameVE(VE))
14635 auto It = MultiNodeScalars.
find(S.getMainOp());
14636 if (It != MultiNodeScalars.
end()) {
14637 auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
14638 return TE != VE && CheckSameVE(TE);
14640 if (
I != It->getSecond().end())
14646Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
14647 bool PostponedPHIs) {
14648 ValueList &VL = E->getOperand(NodeIdx);
14649 const unsigned VF = VL.size();
14650 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14655 Type *ScalarTy = cast<VectorType>(
V->getType())->getElementType();
14657 ShuffleInstructionBuilder ShuffleBuilder(
14661 ShuffleBuilder.add(V, Mask);
14663 E->CombinedEntriesWithIndices.size());
14664 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14665 [&](
const auto &
P) {
14666 return std::make_pair(VectorizableTree[P.first].get(),
14669 assert((E->CombinedEntriesWithIndices.empty() ||
14670 E->ReorderIndices.empty()) &&
14671 "Expected either combined subnodes or reordering");
14672 return ShuffleBuilder.finalize({}, SubVectors, {});
14676 cast<FixedVectorType>(
V->getType())->getNumElements()) {
14677 if (!VE->ReuseShuffleIndices.empty()) {
14698 if (isa<PoisonValue>(V))
14700 Mask[
I] = VE->findLaneForValue(V);
14702 V = FinalShuffle(V, Mask);
14704 assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
14705 "Expected vectorization factor less "
14706 "than original vector size.");
14708 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14709 V = FinalShuffle(V, UniformMask);
14715 if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
14716 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14717 }) == VE->UserTreeIndices.end()) {
14719 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14720 return TE->isGather() &&
TE->UserTreeIndices.front().UserTE == E &&
14721 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14723 assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
14724 (*It)->VectorizedValue =
V;
14732 auto *
I =
find_if(VectorizableTree,
14733 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
14734 return TE->isOperandGatherNode({E, NodeIdx});
14736 assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
14737 assert(
I->get()->UserTreeIndices.size() == 1 &&
14738 "Expected only single user for the gather node.");
14739 assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
14743template <
typename BVTy,
typename ResTy,
typename...
Args>
14744ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
14746 assert(E->isGather() &&
"Expected gather node.");
14747 unsigned VF = E->getVectorFactor();
14749 bool NeedFreeze =
false;
14751 E->ReuseShuffleIndices.end());
14754 for (
auto [EIdx,
Idx] : E->CombinedEntriesWithIndices)
14756 .slice(
Idx, VectorizableTree[EIdx]->getVectorFactor()),
14759 E->CombinedEntriesWithIndices.size());
14760 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14761 [&](
const auto &
P) {
14762 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14767 E->ReorderIndices.end());
14768 if (!ReorderMask.empty())
14774 if (!SubVectors.empty() && !SubVectorsMask.
empty()) {
14775 for (
unsigned I : seq<unsigned>(GatheredScalars.size()))
14776 if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
14779 SubVectorsMask.
clear();
14783 unsigned I,
unsigned SliceSize,
14784 bool IsNotPoisonous) {
14786 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14789 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14790 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14791 if (UserTE->getNumOperands() != 2)
14793 if (!IsNotPoisonous) {
14795 find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
14796 return find_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
14797 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14798 }) !=
TE->UserTreeIndices.end();
14800 if (It == VectorizableTree.end())
14803 if (!(*It)->ReorderIndices.empty()) {
14807 if (!
all_of(
zip(GatheredScalars, GS), [&](
const auto &
P) {
14808 Value *V0 = std::get<0>(
P);
14809 Value *V1 = std::get<1>(
P);
14810 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14811 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14817 if ((
Mask.size() < InputVF &&
14820 (
Mask.size() == InputVF &&
14823 std::next(
Mask.begin(),
I * SliceSize),
14824 std::next(
Mask.begin(),
14831 std::next(
Mask.begin(),
I * SliceSize),
14832 std::next(
Mask.begin(),
14838 BVTy ShuffleBuilder(ScalarTy, Params...);
14839 ResTy Res = ResTy();
14843 Value *ExtractVecBase =
nullptr;
14844 bool UseVecBaseAsInput =
false;
14847 Type *OrigScalarTy = GatheredScalars.front()->getType();
14850 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14855 if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
14857 bool Resized =
false;
14859 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14860 if (!ExtractShuffles.
empty()) {
14865 if (
const auto *TE = getTreeEntry(
14866 cast<ExtractElementInst>(StoredGS[
Idx])->getVectorOperand()))
14869 if (std::optional<ResTy> Delayed =
14870 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14872 PostponedGathers.
insert(E);
14877 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
14878 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14879 ExtractVecBase = VecBase;
14880 if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14881 if (VF == VecBaseTy->getNumElements() &&
14882 GatheredScalars.size() != VF) {
14884 GatheredScalars.append(VF - GatheredScalars.size(),
14890 if (!ExtractShuffles.
empty() || E->getOpcode() != Instruction::Load ||
14891 ((E->getOpcode() == Instruction::Load ||
14892 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14895 return isa<LoadInst>(V) && getTreeEntry(V);
14897 E->isAltShuffle() ||
14898 all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
14900 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14902 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14904 if (!GatherShuffles.
empty()) {
14905 if (std::optional<ResTy> Delayed =
14906 ShuffleBuilder.needToDelay(E, Entries)) {
14908 PostponedGathers.
insert(E);
14913 if (GatherShuffles.
size() == 1 &&
14915 Entries.front().front()->isSame(E->Scalars)) {
14918 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
14921 Mask.resize(E->Scalars.size());
14922 const TreeEntry *FrontTE = Entries.front().front();
14923 if (FrontTE->ReorderIndices.empty() &&
14924 ((FrontTE->ReuseShuffleIndices.empty() &&
14925 E->Scalars.size() == FrontTE->Scalars.size()) ||
14926 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14927 std::iota(
Mask.begin(),
Mask.end(), 0);
14930 if (isa<PoisonValue>(V)) {
14934 Mask[
I] = FrontTE->findLaneForValue(V);
14937 ShuffleBuilder.add(*FrontTE, Mask);
14938 Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
14943 if (GatheredScalars.size() != VF &&
14945 return any_of(TEs, [&](
const TreeEntry *TE) {
14946 return TE->getVectorFactor() == VF;
14949 GatheredScalars.append(VF - GatheredScalars.size(),
14953 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
14961 bool IsRootPoison) {
14964 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
14971 int NumNonConsts = 0;
14974 if (isa<UndefValue>(V)) {
14975 if (!isa<PoisonValue>(V)) {
14990 Scalars.
front() = OrigV;
14993 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
14994 Scalars[Res.first->second] = OrigV;
14995 ReuseMask[
I] = Res.first->second;
14998 if (NumNonConsts == 1) {
15003 if (!UndefPos.
empty() && UndefPos.
front() == 0)
15006 ReuseMask[SinglePos] = SinglePos;
15007 }
else if (!UndefPos.
empty() && IsSplat) {
15012 return !isa<UndefValue>(V) &&
15014 (E->UserTreeIndices.size() == 1 &&
15018 return E->UserTreeIndices.front().EdgeIdx !=
15019 U.getOperandNo() &&
15021 E->UserTreeIndices.front().UserTE->Scalars,
15025 if (It != Scalars.
end()) {
15027 int Pos = std::distance(Scalars.
begin(), It);
15028 for (
int I : UndefPos) {
15030 ReuseMask[
I] = Pos;
15039 for (
int I : UndefPos) {
15041 if (isa<UndefValue>(Scalars[
I]))
15048 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
15049 bool IsNonPoisoned =
true;
15050 bool IsUsedInExpr =
true;
15051 Value *Vec1 =
nullptr;
15052 if (!ExtractShuffles.
empty()) {
15056 Value *Vec2 =
nullptr;
15057 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15061 if (UseVecBaseAsInput) {
15062 Vec1 = ExtractVecBase;
15064 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15067 if (isa<UndefValue>(E->Scalars[
I]))
15069 auto *EI = cast<ExtractElementInst>(StoredGS[
I]);
15070 Value *VecOp = EI->getVectorOperand();
15071 if (
const auto *TE = getTreeEntry(VecOp))
15072 if (
TE->VectorizedValue)
15073 VecOp =
TE->VectorizedValue;
15076 }
else if (Vec1 != VecOp) {
15077 assert((!Vec2 || Vec2 == VecOp) &&
15078 "Expected only 1 or 2 vectors shuffle.");
15084 IsUsedInExpr =
false;
15087 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15090 IsUsedInExpr &= FindReusedSplat(
15092 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
15093 ExtractMask.size(), IsNotPoisonedVec);
15094 ShuffleBuilder.add(Vec1, ExtractMask,
true);
15095 IsNonPoisoned &= IsNotPoisonedVec;
15097 IsUsedInExpr =
false;
15102 if (!GatherShuffles.
empty()) {
15105 for (
const auto [
I, TEs] :
enumerate(Entries)) {
15108 "No shuffles with empty entries list expected.");
15112 "Expected shuffle of 1 or 2 entries.");
15116 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
15117 if (TEs.
size() == 1) {
15118 bool IsNotPoisonedVec =
15119 TEs.
front()->VectorizedValue
15123 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
15124 SliceSize, IsNotPoisonedVec);
15125 ShuffleBuilder.add(*TEs.
front(), VecMask);
15126 IsNonPoisoned &= IsNotPoisonedVec;
15128 IsUsedInExpr =
false;
15129 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
15130 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
15141 int EMSz = ExtractMask.size();
15142 int MSz =
Mask.size();
15145 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
15146 bool IsIdentityShuffle =
15147 ((UseVecBaseAsInput ||
15149 [](
const std::optional<TTI::ShuffleKind> &SK) {
15153 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
15155 (!GatherShuffles.
empty() &&
15157 [](
const std::optional<TTI::ShuffleKind> &SK) {
15161 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
15163 bool EnoughConstsForShuffle =
15167 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15171 return isa<Constant>(V) && !isa<UndefValue>(V);
15173 (!IsIdentityShuffle ||
15174 (GatheredScalars.size() == 2 &&
15176 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
15178 return isa<Constant>(V) && !isa<PoisonValue>(V);
15182 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
15183 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
15189 if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15191 TryPackScalars(GatheredScalars, BVMask,
true);
15192 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15193 ShuffleBuilder.add(BV, BVMask);
15196 return isa<PoisonValue>(V) ||
15197 (IsSingleShuffle && ((IsIdentityShuffle &&
15198 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15200 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15203 Res = ShuffleBuilder.finalize(
15204 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15206 TryPackScalars(NonConstants, Mask,
false);
15207 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
15212 TryPackScalars(GatheredScalars, ReuseMask,
true);
15213 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
15214 ShuffleBuilder.add(BV, ReuseMask);
15215 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15220 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
15221 if (!isa<PoisonValue>(V))
15224 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15225 ShuffleBuilder.add(BV, Mask);
15226 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15231 Res = ShuffleBuilder.createFreeze(Res);
15235Value *BoUpSLP::createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
15236 bool PostponedPHIs) {
15237 for (
auto [EIdx,
_] : E->CombinedEntriesWithIndices)
15239 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15247 for (
Value *V : VL)
15248 if (isa<Instruction>(V))
15256 if (E->VectorizedValue &&
15257 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15258 E->isAltShuffle())) {
15259 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *E->Scalars[0] <<
".\n");
15260 return E->VectorizedValue;
15263 Value *
V = E->Scalars.front();
15264 Type *ScalarTy =
V->getType();
15265 if (!isa<CmpInst>(V))
15267 auto It = MinBWs.
find(E);
15268 if (It != MinBWs.
end()) {
15269 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15275 if (E->isGather()) {
15277 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
15278 setInsertPointAfterBundle(E);
15279 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15280 E->VectorizedValue = Vec;
15284 bool IsReverseOrder =
15285 !E->ReorderIndices.empty() &&
isReverseOrder(E->ReorderIndices);
15286 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E) {
15287 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
15288 if (E->getOpcode() == Instruction::Store &&
15289 E->State == TreeEntry::Vectorize) {
15291 ArrayRef(
reinterpret_cast<const int *
>(E->ReorderIndices.begin()),
15292 E->ReorderIndices.size());
15293 ShuffleBuilder.add(V, Mask);
15294 }
else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15295 ShuffleBuilder.addOrdered(V, {});
15297 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15300 E->CombinedEntriesWithIndices.size());
15302 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
15303 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15306 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15307 "Expected either combined subnodes or reordering");
15308 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15311 assert(!E->isGather() &&
"Unhandled state");
15312 unsigned ShuffleOrOp =
15313 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
15315 auto GetOperandSignedness = [&](
unsigned Idx) {
15316 const TreeEntry *OpE = getOperandEntry(E,
Idx);
15317 bool IsSigned =
false;
15318 auto It = MinBWs.
find(OpE);
15319 if (It != MinBWs.
end())
15320 IsSigned = It->second.second;
15323 if (isa<PoisonValue>(V))
15325 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15329 switch (ShuffleOrOp) {
15330 case Instruction::PHI: {
15331 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15332 E != VectorizableTree.front().get() ||
15333 !E->UserTreeIndices.empty()) &&
15334 "PHI reordering is free.");
15335 if (PostponedPHIs && E->VectorizedValue)
15336 return E->VectorizedValue;
15337 auto *PH = cast<PHINode>(VL0);
15339 PH->getParent()->getFirstNonPHIIt());
15341 if (PostponedPHIs || !E->VectorizedValue) {
15348 PH->getParent()->getFirstInsertionPt());
15351 V = FinalShuffle(V, E);
15353 E->VectorizedValue =
V;
15357 PHINode *NewPhi = cast<PHINode>(E->PHI);
15366 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15372 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15376 if (!VisitedBBs.
insert(IBB).second) {
15383 Value *Vec = vectorizeOperand(E,
I,
true);
15384 if (VecTy != Vec->
getType()) {
15386 MinBWs.
contains(getOperandEntry(E,
I))) &&
15387 "Expected item in MinBWs.");
15388 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
15394 "Invalid number of incoming values");
15395 assert(E->VectorizedValue &&
"Expected vectorized value.");
15396 return E->VectorizedValue;
15399 case Instruction::ExtractElement: {
15400 Value *
V = E->getSingleOperand(0);
15401 if (
const TreeEntry *TE = getTreeEntry(V))
15402 V =
TE->VectorizedValue;
15403 setInsertPointAfterBundle(E);
15404 V = FinalShuffle(V, E);
15405 E->VectorizedValue =
V;
15408 case Instruction::ExtractValue: {
15409 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15414 NewV = FinalShuffle(NewV, E);
15415 E->VectorizedValue = NewV;
15418 case Instruction::InsertElement: {
15419 assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
15421 Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
15423 Type *ScalarTy =
Op.front()->getType();
15424 if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
15426 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
15427 assert(Res.first > 0 &&
"Expected item in MinBWs.");
15432 cast<FixedVectorType>(
V->getType())->getNumElements()),
15437 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
15438 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15440 const unsigned NumElts =
15441 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15442 const unsigned NumScalars = E->Scalars.size();
15445 assert(
Offset < NumElts &&
"Failed to find vector index offset");
15449 if (!E->ReorderIndices.empty()) {
15454 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
15457 bool IsIdentity =
true;
15459 Mask.swap(PrevMask);
15460 for (
unsigned I = 0;
I < NumScalars; ++
I) {
15463 IsIdentity &= InsertIdx -
Offset ==
I;
15466 if (!IsIdentity || NumElts != NumScalars) {
15468 bool IsVNonPoisonous =
15471 if (NumElts != NumScalars &&
Offset == 0) {
15480 InsertMask[*InsertIdx] = *InsertIdx;
15481 if (!
Ins->hasOneUse())
15483 Ins = dyn_cast_or_null<InsertElementInst>(
15484 Ins->getUniqueUndroppableUser());
15487 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15489 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15492 if (!IsFirstPoison.
all()) {
15494 for (
unsigned I = 0;
I < NumElts;
I++) {
15496 IsFirstUndef.
test(
I)) {
15497 if (IsVNonPoisonous) {
15498 InsertMask[
I] =
I < NumScalars ?
I : 0;
15503 if (
Idx >= NumScalars)
15504 Idx = NumScalars - 1;
15505 InsertMask[
I] = NumScalars +
Idx;
15519 if (
auto *
I = dyn_cast<Instruction>(V)) {
15520 GatherShuffleExtractSeq.
insert(
I);
15521 CSEBlocks.
insert(
I->getParent());
15526 for (
unsigned I = 0;
I < NumElts;
I++) {
15531 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15534 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
15535 NumElts != NumScalars) {
15536 if (IsFirstUndef.
all()) {
15539 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15540 if (!IsFirstPoison.
all()) {
15541 for (
unsigned I = 0;
I < NumElts;
I++) {
15543 InsertMask[
I] =
I + NumElts;
15550 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
15551 if (
auto *
I = dyn_cast<Instruction>(V)) {
15552 GatherShuffleExtractSeq.
insert(
I);
15553 CSEBlocks.
insert(
I->getParent());
15558 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15559 for (
unsigned I = 0;
I < NumElts;
I++) {
15563 InsertMask[
I] += NumElts;
15566 FirstInsert->getOperand(0), V, InsertMask,
15567 cast<Instruction>(E->Scalars.back())->getName());
15568 if (
auto *
I = dyn_cast<Instruction>(V)) {
15569 GatherShuffleExtractSeq.
insert(
I);
15570 CSEBlocks.
insert(
I->getParent());
15575 ++NumVectorInstructions;
15576 E->VectorizedValue =
V;
15579 case Instruction::ZExt:
15580 case Instruction::SExt:
15581 case Instruction::FPToUI:
15582 case Instruction::FPToSI:
15583 case Instruction::FPExt:
15584 case Instruction::PtrToInt:
15585 case Instruction::IntToPtr:
15586 case Instruction::SIToFP:
15587 case Instruction::UIToFP:
15588 case Instruction::Trunc:
15589 case Instruction::FPTrunc:
15590 case Instruction::BitCast: {
15591 setInsertPointAfterBundle(E);
15593 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15594 if (E->VectorizedValue) {
15595 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15596 return E->VectorizedValue;
15599 auto *CI = cast<CastInst>(VL0);
15601 Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
15602 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
15604 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
15607 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
15608 if (SrcIt != MinBWs.
end())
15609 SrcBWSz = SrcIt->second.first;
15611 if (BWSz == SrcBWSz) {
15612 VecOpcode = Instruction::BitCast;
15613 }
else if (BWSz < SrcBWSz) {
15614 VecOpcode = Instruction::Trunc;
15615 }
else if (It != MinBWs.
end()) {
15616 assert(BWSz > SrcBWSz &&
"Invalid cast!");
15617 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15618 }
else if (SrcIt != MinBWs.
end()) {
15619 assert(BWSz > SrcBWSz &&
"Invalid cast!");
15621 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15623 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
15624 !SrcIt->second.second) {
15625 VecOpcode = Instruction::UIToFP;
15627 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15629 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
15630 V = FinalShuffle(V, E);
15632 E->VectorizedValue =
V;
15633 ++NumVectorInstructions;
15636 case Instruction::FCmp:
15637 case Instruction::ICmp: {
15638 setInsertPointAfterBundle(E);
15640 Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
15641 if (E->VectorizedValue) {
15642 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15643 return E->VectorizedValue;
15645 Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
15646 if (E->VectorizedValue) {
15647 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15648 return E->VectorizedValue;
15650 if (
L->getType() !=
R->getType()) {
15652 getOperandEntry(E, 1)->
isGather() ||
15653 MinBWs.
contains(getOperandEntry(E, 0)) ||
15654 MinBWs.
contains(getOperandEntry(E, 1))) &&
15655 "Expected item in MinBWs.");
15656 if (cast<VectorType>(
L->getType())
15658 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
15660 ->getIntegerBitWidth()) {
15661 Type *CastTy =
R->getType();
15664 Type *CastTy =
L->getType();
15672 if (
auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.
end())
15673 ICmp->setSameSign(
false);
15675 VecTy = cast<FixedVectorType>(
V->getType());
15676 V = FinalShuffle(V, E);
15678 E->VectorizedValue =
V;
15679 ++NumVectorInstructions;
15682 case Instruction::Select: {
15683 setInsertPointAfterBundle(E);
15685 Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
15686 if (E->VectorizedValue) {
15687 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15688 return E->VectorizedValue;
15690 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15691 if (E->VectorizedValue) {
15692 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15693 return E->VectorizedValue;
15695 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15696 if (E->VectorizedValue) {
15697 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15698 return E->VectorizedValue;
15702 getOperandEntry(E, 2)->
isGather() ||
15703 MinBWs.
contains(getOperandEntry(E, 1)) ||
15704 MinBWs.
contains(getOperandEntry(E, 2))) &&
15705 "Expected item in MinBWs.");
15706 if (True->
getType() != VecTy)
15707 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
15708 if (False->
getType() != VecTy)
15709 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
15714 assert(TrueNumElements >= CondNumElements &&
15715 TrueNumElements % CondNumElements == 0 &&
15716 "Cannot vectorize Instruction::Select");
15718 "Cannot vectorize Instruction::Select");
15719 if (CondNumElements != TrueNumElements) {
15727 "Cannot vectorize Instruction::Select");
15729 V = FinalShuffle(V, E);
15731 E->VectorizedValue =
V;
15732 ++NumVectorInstructions;
15735 case Instruction::FNeg: {
15736 setInsertPointAfterBundle(E);
15738 Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15740 if (E->VectorizedValue) {
15741 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15742 return E->VectorizedValue;
15748 if (
auto *
I = dyn_cast<Instruction>(V))
15751 V = FinalShuffle(V, E);
15753 E->VectorizedValue =
V;
15754 ++NumVectorInstructions;
15758 case Instruction::Freeze: {
15759 setInsertPointAfterBundle(E);
15761 Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15763 if (E->VectorizedValue) {
15764 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15765 return E->VectorizedValue;
15768 if (
Op->getType() != VecTy) {
15770 MinBWs.
contains(getOperandEntry(E, 0))) &&
15771 "Expected item in MinBWs.");
15775 V = FinalShuffle(V, E);
15777 E->VectorizedValue =
V;
15778 ++NumVectorInstructions;
15782 case Instruction::Add:
15783 case Instruction::FAdd:
15784 case Instruction::Sub:
15785 case Instruction::FSub:
15786 case Instruction::Mul:
15787 case Instruction::FMul:
15788 case Instruction::UDiv:
15789 case Instruction::SDiv:
15790 case Instruction::FDiv:
15791 case Instruction::URem:
15792 case Instruction::SRem:
15793 case Instruction::FRem:
15794 case Instruction::Shl:
15795 case Instruction::LShr:
15796 case Instruction::AShr:
15797 case Instruction::And:
15798 case Instruction::Or:
15799 case Instruction::Xor: {
15800 setInsertPointAfterBundle(E);
15802 Value *
LHS = vectorizeOperand(E, 0, PostponedPHIs);
15803 if (E->VectorizedValue) {
15804 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15805 return E->VectorizedValue;
15807 Value *
RHS = vectorizeOperand(E, 1, PostponedPHIs);
15808 if (E->VectorizedValue) {
15809 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15810 return E->VectorizedValue;
15812 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
15813 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15816 auto *CI = dyn_cast<ConstantInt>(
Op);
15817 return CI && CI->getValue().countr_one() >= It->second.first;
15819 V = FinalShuffle(
I == 0 ? RHS : LHS, E);
15820 E->VectorizedValue =
V;
15821 ++NumVectorInstructions;
15828 getOperandEntry(E, 1)->
isGather() ||
15829 MinBWs.
contains(getOperandEntry(E, 0)) ||
15830 MinBWs.
contains(getOperandEntry(E, 1))) &&
15831 "Expected item in MinBWs.");
15842 if (
auto *
I = dyn_cast<Instruction>(V)) {
15845 if (!MinBWs.
contains(E) && ShuffleOrOp == Instruction::Sub &&
15847 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15849 I->setHasNoUnsignedWrap(
false);
15852 V = FinalShuffle(V, E);
15854 E->VectorizedValue =
V;
15855 ++NumVectorInstructions;
15859 case Instruction::Load: {
15862 setInsertPointAfterBundle(E);
15864 LoadInst *LI = cast<LoadInst>(VL0);
15867 if (E->State == TreeEntry::Vectorize) {
15869 }
else if (E->State == TreeEntry::StridedVectorize) {
15870 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15871 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15872 PO = IsReverseOrder ? PtrN : Ptr0;
15878 int Stride = *Diff / (
static_cast<int>(E->Scalars.size()) - 1);
15880 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15881 DL->getTypeAllocSize(ScalarTy));
15885 return cast<LoadInst>(V)->getPointerOperand();
15888 std::optional<Value *> Stride =
15897 (IsReverseOrder ? -1 : 1) *
15898 static_cast<int>(
DL->getTypeAllocSize(ScalarTy))));
15900 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15902 Intrinsic::experimental_vp_strided_load,
15903 {VecTy, PO->
getType(), StrideTy},
15905 Builder.
getInt32(E->Scalars.size())});
15911 assert(E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
15912 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15913 if (E->VectorizedValue) {
15914 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
15915 return E->VectorizedValue;
15917 if (isa<FixedVectorType>(ScalarTy)) {
15921 unsigned ScalarTyNumElements =
15922 cast<FixedVectorType>(ScalarTy)->getNumElements();
15923 unsigned VecTyNumElements =
15924 cast<FixedVectorType>(VecTy)->getNumElements();
15925 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15926 "Cannot expand getelementptr.");
15927 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15930 return Builder.getInt64(I % ScalarTyNumElements);
15939 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15944 V = FinalShuffle(V, E);
15945 E->VectorizedValue =
V;
15946 ++NumVectorInstructions;
15949 case Instruction::Store: {
15950 auto *
SI = cast<StoreInst>(VL0);
15952 setInsertPointAfterBundle(E);
15954 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15955 if (VecValue->
getType() != VecTy)
15957 Builder.
CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15958 VecValue = FinalShuffle(VecValue, E);
15962 if (E->State == TreeEntry::Vectorize) {
15965 assert(E->State == TreeEntry::StridedVectorize &&
15966 "Expected either strided or consecutive stores.");
15967 if (!E->ReorderIndices.empty()) {
15968 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15969 Ptr =
SI->getPointerOperand();
15971 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15972 Type *StrideTy =
DL->getIndexType(
SI->getPointerOperandType());
15974 Intrinsic::experimental_vp_strided_store,
15975 {VecTy,
Ptr->getType(), StrideTy},
15978 StrideTy, -
static_cast<int>(
DL->getTypeAllocSize(ScalarTy))),
15980 Builder.
getInt32(E->Scalars.size())});
15989 E->VectorizedValue =
V;
15990 ++NumVectorInstructions;
15993 case Instruction::GetElementPtr: {
15994 auto *GEP0 = cast<GetElementPtrInst>(VL0);
15995 setInsertPointAfterBundle(E);
15997 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
15998 if (E->VectorizedValue) {
15999 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
16000 return E->VectorizedValue;
16004 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
16005 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
16006 if (E->VectorizedValue) {
16007 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
16008 return E->VectorizedValue;
16013 Value *
V = Builder.
CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16014 if (
Instruction *
I = dyn_cast<GetElementPtrInst>(V)) {
16016 for (
Value *V : E->Scalars) {
16017 if (isa<GetElementPtrInst>(V))
16023 V = FinalShuffle(V, E);
16025 E->VectorizedValue =
V;
16026 ++NumVectorInstructions;
16030 case Instruction::Call: {
16031 CallInst *CI = cast<CallInst>(VL0);
16032 setInsertPointAfterBundle(E);
16038 It != MinBWs.
end() ? It->second.first : 0,
TTI);
16041 VecCallCosts.first <= VecCallCosts.second;
16043 Value *ScalarArg =
nullptr;
16049 auto *CEI = cast<CallInst>(VL0);
16050 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
16055 ScalarArg = CEI->getArgOperand(
I);
16058 if (
ID == Intrinsic::abs && It != MinBWs.
end() &&
16059 It->second.first <
DL->getTypeSizeInBits(CEI->getType()))
16067 Value *OpVec = vectorizeOperand(E,
I, PostponedPHIs);
16068 if (E->VectorizedValue) {
16069 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
16070 return E->VectorizedValue;
16072 ScalarArg = CEI->getArgOperand(
I);
16073 if (cast<VectorType>(OpVec->
getType())->getElementType() !=
16075 It == MinBWs.
end()) {
16078 OpVec = Builder.
CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
16079 }
else if (It != MinBWs.
end()) {
16080 OpVec = Builder.
CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
16089 if (!UseIntrinsic) {
16105 V = FinalShuffle(V, E);
16107 E->VectorizedValue =
V;
16108 ++NumVectorInstructions;
16111 case Instruction::ShuffleVector: {
16113 if (
SLPReVec && !E->isAltShuffle()) {
16114 setInsertPointAfterBundle(E);
16115 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16116 if (E->VectorizedValue) {
16117 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
16118 return E->VectorizedValue;
16121 if (
auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16122 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16123 "Not supported shufflevector usage.");
16126 return SVSrc->getShuffleMask()[Mask];
16133 if (
auto *
I = dyn_cast<Instruction>(V))
16135 V = FinalShuffle(V, E);
16137 assert(E->isAltShuffle() &&
16142 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16143 "Invalid Shuffle Vector Operand");
16147 setInsertPointAfterBundle(E);
16148 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16149 if (E->VectorizedValue) {
16150 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
16151 return E->VectorizedValue;
16153 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16155 setInsertPointAfterBundle(E);
16156 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16158 if (E->VectorizedValue) {
16159 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
16160 return E->VectorizedValue;
16167 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16168 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16169 MinBWs.
contains(getOperandEntry(E, 0)) ||
16170 MinBWs.
contains(getOperandEntry(E, 1))) &&
16171 "Expected item in MinBWs.");
16172 Type *CastTy = VecTy;
16176 ->getIntegerBitWidth() < cast<VectorType>(
RHS->
getType())
16178 ->getIntegerBitWidth())
16195 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16196 V0 = Builder.
CreateCmp(CI0->getPredicate(), LHS, RHS);
16197 auto *AltCI = cast<CmpInst>(E->getAltOp());
16199 V1 = Builder.
CreateCmp(AltPred, LHS, RHS);
16202 unsigned SrcBWSz =
DL->getTypeSizeInBits(
16203 cast<VectorType>(
LHS->
getType())->getElementType());
16204 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
16205 if (BWSz <= SrcBWSz) {
16206 if (BWSz < SrcBWSz)
16209 "Expected same type as operand.");
16210 if (
auto *
I = dyn_cast<Instruction>(LHS))
16212 LHS = FinalShuffle(LHS, E);
16213 E->VectorizedValue =
LHS;
16214 ++NumVectorInstructions;
16225 for (
Value *V : {V0, V1}) {
16226 if (
auto *
I = dyn_cast<Instruction>(V)) {
16227 GatherShuffleExtractSeq.
insert(
I);
16228 CSEBlocks.
insert(
I->getParent());
16237 E->buildAltOpShuffleMask(
16239 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
16243 Mask, &OpScalars, &AltScalars);
16247 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
16249 if (
auto *
I = dyn_cast<Instruction>(Vec);
16250 I && Opcode == Instruction::Sub && !MinBWs.
contains(E) &&
16252 if (isa<PoisonValue>(V))
16254 auto *IV = cast<Instruction>(V);
16255 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16257 I->setHasNoUnsignedWrap(
false);
16259 DropNuwFlag(V0, E->getOpcode());
16260 DropNuwFlag(V1, E->getAltOpcode());
16262 if (
auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16267 if (
auto *
I = dyn_cast<Instruction>(V)) {
16269 GatherShuffleExtractSeq.
insert(
I);
16270 CSEBlocks.
insert(
I->getParent());
16274 E->VectorizedValue =
V;
16275 ++NumVectorInstructions;
16294 for (
auto &BSIter : BlocksSchedules) {
16295 scheduleBlock(BSIter.second.get());
16299 EntryToLastInstruction.
clear();
16309 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16310 if (GatheredLoadsEntriesFirst.has_value() &&
16311 TE->Idx >= *GatheredLoadsEntriesFirst &&
16312 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16313 assert((!TE->UserTreeIndices.empty() ||
16314 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16315 "Expected gathered load node.");
16321 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16322 if (TE->State == TreeEntry::Vectorize &&
16323 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16324 TE->VectorizedValue)
16330 for (
const TreeEntry *E : PostponedNodes) {
16331 auto *TE =
const_cast<TreeEntry *
>(E);
16332 if (
auto *VecTE = getTreeEntry(TE->Scalars.front()))
16333 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16334 TE->UserTreeIndices.front().EdgeIdx)) &&
16335 VecTE->isSame(TE->Scalars))
16339 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16340 TE->VectorizedValue =
nullptr;
16342 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16351 if (isa<PHINode>(UserI)) {
16354 for (
User *U : PrevVec->users()) {
16357 auto *UI = dyn_cast<Instruction>(U);
16358 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->
getParent())
16360 if (UI->comesBefore(InsertPt))
16369 if (
auto *VecI = dyn_cast<Instruction>(Vec);
16374 if (Vec->
getType() != PrevVec->getType()) {
16376 PrevVec->getType()->isIntOrIntVectorTy() &&
16377 "Expected integer vector types only.");
16378 std::optional<bool> IsSigned;
16379 for (
Value *V : TE->Scalars) {
16380 if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
16381 auto It = MinBWs.
find(BaseTE);
16382 if (It != MinBWs.
end()) {
16383 IsSigned = IsSigned.value_or(
false) || It->second.second;
16387 for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
16388 auto It = MinBWs.
find(MNTE);
16389 if (It != MinBWs.
end()) {
16390 IsSigned = IsSigned.value_or(
false) || It->second.second;
16395 if (IsSigned.value_or(
false))
16398 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16399 auto It = MinBWs.
find(BVE);
16400 if (It != MinBWs.
end()) {
16401 IsSigned = IsSigned.value_or(
false) || It->second.second;
16406 if (IsSigned.value_or(
false))
16408 if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
16410 IsSigned.value_or(
false) ||
16414 if (IsSigned.value_or(
false))
16418 if (IsSigned.value_or(
false)) {
16420 auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
16421 if (It != MinBWs.
end())
16422 IsSigned = It->second.second;
16425 "Expected user node or perfect diamond match in MinBWs.");
16429 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
16432 auto It = PostponedValues.
find(PrevVec);
16433 if (It != PostponedValues.
end()) {
16434 for (TreeEntry *VTE : It->getSecond())
16435 VTE->VectorizedValue = Vec;
16455 for (
const auto &ExternalUse : ExternalUses) {
16456 Value *Scalar = ExternalUse.Scalar;
16463 TreeEntry *E = getTreeEntry(Scalar);
16464 assert(E &&
"Invalid scalar");
16465 assert(!E->isGather() &&
"Extracting from a gather list");
16467 if (E->getOpcode() == Instruction::GetElementPtr &&
16468 !isa<GetElementPtrInst>(Scalar))
16471 Value *Vec = E->VectorizedValue;
16472 assert(Vec &&
"Can't find vectorizable value");
16475 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
16476 if (Scalar->getType() != Vec->
getType()) {
16477 Value *Ex =
nullptr;
16478 Value *ExV =
nullptr;
16479 auto *Inst = dyn_cast<Instruction>(Scalar);
16480 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.
contains(Inst);
16481 auto It = ScalarToEEs.
find(Scalar);
16482 if (It != ScalarToEEs.
end()) {
16485 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16487 if (EEIt != It->second.end()) {
16488 Value *PrevV = EEIt->second.first;
16489 if (
auto *
I = dyn_cast<Instruction>(PrevV);
16490 I && !ReplaceInst &&
16495 if (
auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16499 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16507 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16508 IgnoredExtracts.
insert(EE);
16511 auto *CloneInst = Inst->clone();
16512 CloneInst->insertBefore(Inst);
16513 if (Inst->hasName())
16517 }
else if (
auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16518 ES && isa<Instruction>(Vec)) {
16519 Value *V = ES->getVectorOperand();
16520 auto *IVec = cast<Instruction>(Vec);
16521 if (
const TreeEntry *ETE = getTreeEntry(V))
16522 V = ETE->VectorizedValue;
16523 if (
auto *
IV = dyn_cast<Instruction>(V);
16524 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
16525 IV->comesBefore(IVec))
16529 }
else if (
auto *VecTy =
16530 dyn_cast<FixedVectorType>(Scalar->getType())) {
16539 Vec, Builder.
getInt64(ExternalUse.Lane * VecTyNumElements));
16546 if (Scalar->getType() != Ex->
getType())
16548 Ex, Scalar->getType(),
16550 auto *
I = dyn_cast<Instruction>(Ex);
16552 : &
F->getEntryBlock(),
16553 std::make_pair(Ex, ExV));
16557 if (
auto *ExI = dyn_cast<Instruction>(Ex);
16559 GatherShuffleExtractSeq.
insert(ExI);
16560 CSEBlocks.
insert(ExI->getParent());
16564 assert(isa<FixedVectorType>(Scalar->getType()) &&
16565 isa<InsertElementInst>(Scalar) &&
16566 "In-tree scalar of vector type is not insertelement?");
16567 auto *IE = cast<InsertElementInst>(Scalar);
16575 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
16579 ExternalUsesAsOriginalScalar.
contains(Scalar) ||
16582 if (ExternalUsesAsOriginalScalar.contains(U))
16584 TreeEntry *UseEntry = getTreeEntry(U);
16586 (UseEntry->State == TreeEntry::Vectorize ||
16588 TreeEntry::StridedVectorize) &&
16589 (E->State == TreeEntry::Vectorize ||
16590 E->State == TreeEntry::StridedVectorize) &&
16591 doesInTreeUserNeedToExtract(
16592 Scalar, getRootEntryInstruction(*UseEntry),
16595 "Scalar with nullptr User must be registered in "
16596 "ExternallyUsedValues map or remain as scalar in vectorized "
16598 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16599 if (
auto *
PHI = dyn_cast<PHINode>(VecI)) {
16600 if (
PHI->getParent()->isLandingPad())
16604 PHI->getParent()->getLandingPadInst()->getIterator()));
16607 PHI->getParent()->getFirstNonPHIIt());
16610 std::next(VecI->getIterator()));
16615 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16617 if (Scalar != NewInst) {
16618 assert((!isa<ExtractElementInst>(Scalar) ||
16619 !IgnoredExtracts.
contains(cast<ExtractElementInst>(Scalar))) &&
16620 "Extractelements should not be replaced.");
16621 Scalar->replaceAllUsesWith(NewInst);
16626 if (
auto *VU = dyn_cast<InsertElementInst>(
User);
16629 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16630 if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
16631 if (!UsedInserts.
insert(VU).second)
16634 auto BWIt = MinBWs.
find(E);
16636 auto *ScalarTy = FTy->getElementType();
16637 auto Key = std::make_pair(Vec, ScalarTy);
16638 auto VecIt = VectorCasts.
find(Key);
16639 if (VecIt == VectorCasts.
end()) {
16641 if (
auto *IVec = dyn_cast<PHINode>(Vec)) {
16642 if (IVec->getParent()->isLandingPad())
16644 std::next(IVec->getParent()
16645 ->getLandingPadInst()
16649 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16650 }
else if (
auto *IVec = dyn_cast<Instruction>(Vec)) {
16657 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
16658 BWIt->second.second);
16661 Vec = VecIt->second;
16668 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
16675 unsigned Idx = *InsertIdx;
16676 if (It == ShuffledInserts.
end()) {
16678 It = std::next(ShuffledInserts.
begin(),
16679 ShuffledInserts.
size() - 1);
16684 Mask[
Idx] = ExternalUse.Lane;
16685 It->InsertElements.push_back(cast<InsertElementInst>(
User));
16694 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16696 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16697 if (PH->getIncomingValue(
I) == Scalar) {
16699 PH->getIncomingBlock(
I)->getTerminator();
16700 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16702 std::next(VecI->getIterator()));
16706 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16707 PH->setOperand(
I, NewInst);
16712 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16717 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16727 int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
16728 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
16730 CombinedMask1[
I] = Mask[
I];
16732 CombinedMask2[
I] = Mask[
I] - VF;
16735 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
16736 ShuffleBuilder.
add(V1, CombinedMask1);
16738 ShuffleBuilder.
add(V2, CombinedMask2);
16739 return ShuffleBuilder.
finalize({}, {}, {});
16743 bool ForSingleMask) {
16744 unsigned VF = Mask.size();
16745 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
16747 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
16748 Vec = CreateShuffle(Vec,
nullptr, Mask);
16749 return std::make_pair(Vec,
true);
16751 if (!ForSingleMask) {
16753 for (
unsigned I = 0;
I < VF; ++
I) {
16755 ResizeMask[Mask[
I]] = Mask[
I];
16757 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
16761 return std::make_pair(Vec,
false);
16765 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
16771 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
16772 Value *NewInst = performExtractsShuffleAction<Value>(
16776 return cast<VectorType>(Vec->getType())
16777 ->getElementCount()
16778 .getKnownMinValue();
16783 assert((Vals.size() == 1 || Vals.size() == 2) &&
16784 "Expected exactly 1 or 2 input values.");
16785 if (Vals.size() == 1) {
16788 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16789 ->getNumElements() ||
16790 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16791 return CreateShuffle(Vals.front(), nullptr, Mask);
16792 return Vals.front();
16794 return CreateShuffle(Vals.
front() ? Vals.
front()
16796 Vals.
back(), Mask);
16798 auto It = ShuffledInserts[
I].InsertElements.
rbegin();
16801 if (It != ShuffledInserts[
I].InsertElements.
rend())
16804 while (It != ShuffledInserts[
I].InsertElements.
rend()) {
16805 assert(
II &&
"Must be an insertelement instruction.");
16810 II = dyn_cast<InsertElementInst>(
II->getOperand(0));
16813 II->replaceUsesOfWith(
II->getOperand(0), NewInst);
16814 if (
auto *NewI = dyn_cast<Instruction>(NewInst))
16815 if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
16816 II->moveAfter(NewI);
16819 LastInsert->replaceAllUsesWith(NewInst);
16821 IE->replaceUsesOfWith(IE->getOperand(0),
16823 IE->replaceUsesOfWith(IE->getOperand(1),
16827 CSEBlocks.
insert(LastInsert->getParent());
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // ...
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      // ...
    }
  }
  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  if (UserIgnoreList) {
    // ...
      const TreeEntry *IE = getTreeEntry(I);
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() &&
            !IE->UserTreeIndices.empty() &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) {
               return EI.UserTE == VectorizableTree.front().get() &&
                      EI.EdgeIdx == UINT_MAX;
             }))) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            // ...
        continue;
      // ...
      bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                  (match(U.getUser(), m_LogicalAnd()) ||
                                   match(U.getUser(), m_LogicalOr())) &&
                                  U.getOperandNo() == 0;
      if (IsPoisoningLogicalOp) {
        LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
        // ...
      }
      return UserIgnoreList->contains(U.getUser());
    // ...
  }
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));

  InstrElementSize.clear();
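  // If the reduction was narrowed during bitwidth minimization, rebuild the
  // final vector value with a cast back to the recorded (signed or unsigned)
  // reduction type before it is consumed by the reduction root.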
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM the emitted gather sequences: hoist loop-invariant ones into the
  // preheader.
  for (Instruction *I : GatherShuffleExtractSeq) {
    // ...
    Loop *L = LI->getLoopFor(I->getParent());
    // ...
    BasicBlock *PreHeader = L->getLoopPreheader();
    // ...
    // If any operand is defined inside the loop, the sequence cannot be
    // hoisted.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;
    // ...
    CSEBlocks.insert(PreHeader);
  }
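  // Perform a simple CSE over the emitted gather/shuffle/extract sequences:
  // blocks are visited in dominator-DFS order so an equivalent, earlier
  // definition can replace a later one, preferring the more-defined shuffle
  // mask when two shuffles differ only in poison lanes.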
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  // ...
  // Check whether I2 is "at least as defined" as I1: identical instructions,
  // or identical shuffles up to poison lanes in the mask.
  auto &&IsIdenticalOrLessDefined = [&](Instruction *I1, Instruction *I2,
                                        SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      // ...
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the trailing undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           // ...
                                 SM1.size() - LastUndefsCnt));
  };
  // ...
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    // ...
           "Worklist not sorted properly!");
    // ...
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      // ...
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          // ...
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          // ...
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          // ...
          V->replaceAllUsesWith(&In);
          // ...
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          // ...
        }
      // ...
      if (!Replaced)
        Visited.push_back(&In);
  }
  // ...
  GatherShuffleExtractSeq.clear();
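// Scheduler support: a "bundle" links the ScheduleData of all scalars that
// will become one vector instruction. buildBundle() chains the members via
// FirstInBundle/NextInBundle without yet computing dependencies.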
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    // ...
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement and extractelement instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      // ...
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }
    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    // ...
    if (!extendSchedulingRegion(V, S)) {
      // Recalculate all dependencies before bailing out, otherwise later
      // bundles could be scheduled against stale dependency data.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    // ...
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle: get rid of the existing
    // schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
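// cancelScheduling undoes a failed bundle: members are unlinked back into
// single-instruction scheduling entities and re-inserted into the ready list
// when they have no unscheduled dependencies left.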
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         // ...
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
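// extendSchedulingRegion grows the [ScheduleStart, ScheduleEnd) window up or
// down within the basic block until it covers instruction I, bailing out
// once ScheduleRegionSizeLimit is exceeded; assume-like intrinsics are
// stepped over since they are never scheduled.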
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  // ...
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  // ...
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    // ...
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
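// calculateDependencies walks a bundle and records, for every member, its
// def-use dependencies, control dependencies (non-speculatable successors,
// stacksave/stackrestore regions, allocas) and memory dependencies along the
// NextLoadStore chain, incrementing the unscheduled-deps counters that drive
// the ready list.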
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());
  // ...
  while (!WorkList.empty()) {
    // ...
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // ...
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
        }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave, and before any following
        // stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              break;
            if (!isa<AllocaInst>(I))
              continue;
            MakeControlDependent(I);
          }
        }

        // Prevent allocas and loads/stores from moving below a stacksave or a
        // stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      assert(BundleMember->Inst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      // ...
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
        // ...
        if (((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             // ...
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          // ...
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }
        // ...
      }
    }
  }

  if (InsertInReadyList && SD->isReady()) {
    ReadyInsts.insert(SD);
    // ...
  }
}
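// resetSchedule clears the IsScheduled flags and unscheduled-deps counters of
// every ScheduleData in the region so the block can be (re)scheduled from a
// clean state.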
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // ...
  BS->resetSchedule();

  // The ready-list is sorted by the original instruction position so the
  // final schedule stays as close as possible to the original order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
      // ...
             SD->isPartOfBundle() ==
                 // ...
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      // ...
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
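// getVectorElementSize estimates the element width a value will occupy in a
// vector register by walking its use-def chain within the block (loads,
// extracts and stores terminate the walk) and caching the widest width seen
// in InstrElementSize.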
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more profitable
  // width than V's type.
  if (auto *I = dyn_cast<Instruction>(V)) {
    // ...
    Value *FirstNonBool = nullptr;
    while (!Worklist.empty()) {
      // ...
      auto *Ty = I->getType();
      if (isa<VectorType>(Ty))
        continue;
      if (Ty != Builder.getInt1Ty() && !FirstNonBool)
        FirstNonBool = I;

      // If the current instruction is a load, extract or extractvalue, update
      // the maximum width to reflect the width of the operation.
      if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
        Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

      // Otherwise visit the operands of the instruction. We only handle the
      // interesting cases from buildTree here.
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.push_back(J);
            // ...
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    }
    // ...
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
    // ...
    InstrElementSize[I] = Width;
  }
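// collectValuesToDemote recursively decides, per tree entry, whether the
// computation can be performed in a narrower integer type: each opcode gets
// a checker that proves the truncated bits are dead (known-zero bits, sign
// bits, shift-amount bounds), and qualifying entries are queued in ToDemote.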
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  // ...
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      // ...
      if (auto *I = dyn_cast<Instruction>(V)) {
        // ...
        unsigned BitWidth2 =
            std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
        while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
          // ...
        }
        BitWidth1 = std::min(BitWidth1, BitWidth2);
      }
      // ...
    }
    return true;
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          // ...
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          NeedToExit = true;
          return false;
        }
        // ...
        return false;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          // ...
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          // ...
        }
        // ...
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // A shl is demotable if the shift amount is known to be in range for the
    // smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // A lshr can be truncated iff the bits it would shift in are already
    // zero.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // An ashr can be truncated iff the dropped bits are all sign bits of the
    // operand.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    // ...
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        // ...
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 // ...
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 // ...
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    // ...
    unsigned VF = E.Scalars.size();
    // ...
      if (Cost < BestCost) {
        // ...
      }
    // ...
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    // ...
  }
  default:
    break;
  }

  // If we roll back, all demoted values must be kept in the original type.
  // ...
  return FinalAnalysis();
}
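// computeMinimumValueSizes drives the bitwidth minimization: starting from
// the root (store/insertelement seed or reduction), it computes the maximal
// bit width actually required per sub-tree via ComputeMaxBitWidth, records
// results in MinBWs, and keeps following trunc/icmp users through the nodes
// queued in ExtraBitWidthNodes.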
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx > NodeIdx;
                              })))
    return;

  // ...
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  // ...
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    // ...
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  // ...
  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) -> unsigned {
    // Check if the root is trunc and the next node is gather/buildvector,
    // then keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 // ...
                 none_of(V->users(), [&](User *U) {
                   const TreeEntry *TE = getTreeEntry(U);
                   const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                   if (TE == UserTE || !TE)
                     return false;
                   if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(U) ||
                       !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(UserTE->getMainOp()))
                     return true;
                   unsigned UserTESz = DL->getTypeSizeInBits(
                       UserTE->Scalars.front()->getType());
                   auto It = MinBWs.find(TE);
                   if (It != MinBWs.end() && It->second.first > UserTESz)
                     return true;
                   return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                 });
        })) {
      // ...
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    // ...
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend, so we initialize IsKnownPositive
    // by checking that the sign bit of every root is known zero.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // Check if all the bits of the roots are demanded. If not, we can
    // truncate the roots to a narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      // ...
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, add one to the maximum
      // bit width to account for the unknown sign bit, so the root can later
      // be sign-extended back safely.
      if (!IsKnownPositive)
        ++BitWidth1;
      // ...
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;
    // ...
    if (NumParts > 1 &&
        // ...
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // ...
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round up to the next power of 2 to keep the final vector type legal.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };

  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result. A bool-to-int add reduction gets a dedicated
  // 1-bit reduction width.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // ...
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        // ...
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        // ...
        unsigned BitWidth2 = BitWidth1;
        // ...
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    // ...
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    // ...
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      // ...
          DL->getTypeSizeInBits(V->getType()->getScalarType());
      if (OrigBitWidth > MaxBitWidth) {
        // ...
      }
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        // ...
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      // ...
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      // ...
    }
  }
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  // ...
  DL = &F.getDataLayout();
  // ...
  bool Changed = false;

  // If the target claims to have no vector registers, don't attempt
  // vectorization.
  // ...
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
  // ...

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom-up SLP vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
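// vectorizeStoreChain builds an SLP tree for one contiguous chain of stores
// and reports back the canonical tree size so the caller can retry smaller
// or larger VFs; trivially unprofitable chains are rejected before the tree
// is even built.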
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  // ...
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // ...
    return false;
  }
  // ...
  SmallPtrSet<Value *, 16> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  // ...
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    // ...
    bool IsAllowedSize =
        // ...
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  // ...
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    // ...

  // ...
    using namespace ore;
    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        // ...
        return V + Size;
      });
  // ...
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V,
                         const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       // ...
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
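// The check above accepts a candidate set only when the relative standard
// deviation of the (non-unit) tree sizes is small: in integer arithmetic,
// Dev * 81 / Mean^2 == 0 holds iff Dev < Mean^2 / 81, i.e. sqrt(Dev) is less
// than Mean / 9 - the sizes deviate from the mean by less than roughly 11%.
//
// vectorizeStores groups the collected stores into sets with a common base
// and constant distance, then repeatedly tries candidate VFs from MaxVF down
// to MinVF over sliding windows of each set, marking vectorized ranges so
// later, wider attempts skip them.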
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // ...
  bool Changed = false;
  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to the base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    // ...
      // Collect the chain into Operands while consecutive (distance-1) stores
      // keep arriving.
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      // ...
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      // ...
      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        // ...
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      // ...
      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      // ...
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      unsigned MinVF = std::max<unsigned>(
          // ...
              R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
              ValueTy));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      // First try a non-power-of-2 VF where VF + 1 is a power of 2, i.e.
      // almost all vector lanes are used.
      unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
      if (has_single_bit(CandVF + 1)) {
        NonPowerOf2VF = CandVF;
        assert(NonPowerOf2VF != MaxVF &&
               "Non-power-of-2 VF should not be equal to MaxVF");
      }

      unsigned MaxRegVF = MaxVF;
      // ...
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      // ...
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      // ...
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      // ...
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      // ...
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              // ...
                                Size >= MaxRegVF)) {
              // ...
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                // ...
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                AnyProfitableGraph = RepeatChanged = Changed = true;
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  // ...
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  // ...
                }
                // ...
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                // ...
              }
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                // ...
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                // ...
              }
              // ...
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              // ...
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          // ...
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at
        // all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            // ...
            static_cast<unsigned>(
                // ...
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        // ...
        CandidateVFs.clear();
        // ...
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          // ...
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize max number of elements, if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    // ...
  };

  // Stores pair (first: index of the store taken as base, second: sorted set
  // of pairs {index, dist}, the store distances relative to the base).
  // ...
  for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
    std::optional<int> Diff = getPointersDiff(
        Stores[Set.first]->getValueOperand()->getType(),
        Stores[Set.first]->getPointerOperand(),
        SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
        /*StrictCheck=*/true);
    // ...
    auto It = Set.second.find(std::make_pair(Idx, *Diff));
    if (It == Set.second.end()) {
      Set.second.emplace(Idx, *Diff);
      return;
    }
    // Try to vectorize the first found set to avoid duplicate analysis.
    TryToVectorize(Set.second);
    unsigned ItIdx = It->first;
    int ItDist = It->second;
    StoreIndexToDistSet PrevSet;
    copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
            [&](const std::pair<unsigned, int> &Pair) {
              return Pair.first > ItIdx;
            });
    Set.second.clear();
    // ...
    Set.second.emplace(Idx, 0);
    // Insert stores that followed previous match to try to vectorize them
    // with this store.
    unsigned StartIdx = ItIdx + 1;
    // ...
    // Distances to the previously found dup store (or this store, since they
    // store to the same addresses).
    for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
      // Do not try to vectorize sequences, we already tried.
      if (VectorizedStores.contains(Stores[Pair.first]))
        break;
      unsigned BI = Pair.first - StartIdx;
      UsedStores.set(BI);
      Dists[BI] = Pair.second - ItDist;
    }
    for (unsigned I = StartIdx; I < Idx; ++I) {
      unsigned BI = I - StartIdx;
      if (UsedStores.test(BI))
        Set.second.emplace(I, Dists[BI]);
    }
    return;
  }
  auto &Res = SortedStores.emplace_back();
  // ...
  Res.second.emplace(Idx, 0);

  Type *PrevValTy = nullptr;
  // ...
    if (R.isDeleted(SI))
      continue;
    // ...
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  // ...

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);
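// collectSeedInstructions scans a block and buckets simple stores (by the
// underlying object of their pointer operand) into Stores, and single-index,
// non-constant, scalar GEPs into GEPs; these seed the store-chain and
// GEP-index vectorization attempts above.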
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // ...
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      // ...
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      // ...
      if (isa<Constant>(Idx))
        continue;
      // ...
      if (GEP->getType()->isVectorTy())
        continue;
      // ...
    }
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");
  // ...
  for (Value *V : VL) {
    Type *Ty = V->getType();
    // ...
    R.getORE()->emit([&]() {
      std::string TypeStr;
      // ...
             << "Cannot SLP vectorize list: type "
             << TypeStr + " is unsupported by vectorizer";
    });
    // ...
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  // ...
  R.getORE()->emit([&]() {
    // ...
           << "Cannot SLP vectorize list: vectorization factor "
           << "less than 2 is not supported";
  });
  // ...

  bool Changed = false;
  bool CandidateFound = false;
  // ...
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // ...
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      // ...
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          // ...
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");
      // ...
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      // ...
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);
      // ...
                        << " for VF=" << ActualVF << "\n");
      // ...
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));
      // ...
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      // ...
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             // ...
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      // ...
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  // ...
  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  // ...
  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
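// HorizontalReduction models an associative reduction tree (integer and FP
// add/mul/min/max, logical and/or) rooted at a binary op or cmp+select: it
// gathers the reduced values, checks per-kind legality (e.g. no-NaNs for
// fmin/fmax), and later emits the vectorized reduction.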
  /// The operation data of the reduction operation.
  ReductionOpsListType ReductionOps;
  // ...
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;
  // ...
  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;
    // ...
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not have to
      // rule out -0.0 here because the intrinsic semantics do not specify a
      // fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;
    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, the true-value operand is skipped.
    // ...
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      // ...
    }
    case RecurKind::And: {
      // ...
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::FMax:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
    case RecurKind::FMin:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
    case RecurKind::FMaximum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
    case RecurKind::FMinimum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
    case RecurKind::SMax:
      // ...
    case RecurKind::SMin:
      // ...
    case RecurKind::UMax:
      // ...
    case RecurKind::UMin:
      // ...
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    // ...
    if (auto *Sel = dyn_cast<SelectInst>(Op)) {
      // ...
    }
    // ...
  }

  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // ...
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try to match min/max expressed as cmp + select.
      // ...
      if (!isa<ExtractElementInst>(RHS) ||
          // ...
        return RecurKind::None;
      // ...
      if (!isa<ExtractElementInst>(LHS) ||
          // ...
        return RecurKind::None;
      // ...
      if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
        return RecurKind::None;
      // ...
        return RecurKind::None;
      // ...
        return RecurKind::None;
      if (Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE)
        return RecurKind::SMax;
      if (Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_SLE)
        return RecurKind::SMin;
      if (Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE)
        return RecurKind::UMax;
      if (Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_ULE)
        return RecurKind::UMin;
    }
    return RecurKind::None;
  }

  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB. For a cmp+select
  /// min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while the condition must have a single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // An arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           // ...
  }

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ...
    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    // ...
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        // ...
        if (!EdgeInst ||
            // ...
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by their
    // loaded pointer operand.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    // ...
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      // ...
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            // ...
          }
          for (LoadInst *RLI : LIt->second) {
            // ...
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            // ...
          }
        }
      }
      // ...
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      // ...
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        // ...
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced-values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      // ...
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        // ...
        auto RedValsVect = It->second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      // ...
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          // ...
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
    }
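  // tryToReduce: having bucketed the reduced values by kind and base, carve
  // ReduxWidth-sized slices out of the candidate list, build an SLP tree per
  // slice, and fold profitable slices into the growing scalar reduction
  // value; boolean logic reductions freeze operands to preserve poison
  // semantics.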
19830 constexpr unsigned RegMaxNumber = 4;
19831 constexpr unsigned RedValsMaxNumber = 128;
19835 if (
unsigned NumReducedVals = std::accumulate(
19836 ReducedVals.
begin(), ReducedVals.
end(), 0,
19838 if (!isGoodForReduction(Vals))
19840 return Num + Vals.size();
19842 NumReducedVals < ReductionLimit &&
19846 for (ReductionOpsType &RdxOps : ReductionOps)
19847 for (
Value *RdxOp : RdxOps)
19848 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19859 ReducedVals.
front().size());
19863 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19864 assert(isa<SelectInst>(RdxRootInst) &&
19865 "Expected min/max reduction to have select root instruction");
19866 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19867 assert(isa<Instruction>(ScalarCond) &&
19868 "Expected min/max reduction to have compare condition");
19869 return cast<Instruction>(ScalarCond);
19872 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19873 return isBoolLogicOp(cast<Instruction>(V));
19876 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19877 if (VectorizedTree) {
19880 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19881 if (AnyBoolLogicOp) {
19882 auto It = ReducedValsToOps.
find(VectorizedTree);
19883 auto It1 = ReducedValsToOps.
find(Res);
19884 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19886 (It != ReducedValsToOps.
end() &&
19888 return isBoolLogicOp(I) &&
19889 getRdxOperand(I, 0) == VectorizedTree;
19893 (It1 != ReducedValsToOps.
end() &&
19895 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19899 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19903 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
  // The reduction operations must not be deleted while vectorizing their
  // operands, so collect them into an ignore list.
  SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                    ReductionOps.front().size());
  for (ReductionOpsType &RdxOps : ReductionOps)
    for (Value *RdxOp : RdxOps) {
      if (!RdxOp)
        continue;
      IgnoreList.insert(RdxOp);
    }
  // Intersect the fast-math flags from all reduction operations.
  FastMathFlags RdxFMF;
  RdxFMF.set();
  for (Value *U : IgnoreList)
    if (auto *FPMO = dyn_cast<FPMathOperator>(U))
      RdxFMF &= FPMO->getFastMathFlags();
  bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

  // Need to track reduced vals, they may be changed during vectorization of
  // subvectors.
  for (ArrayRef<Value *> Candidates : ReducedVals)
    for (Value *V : Candidates)
      TrackedVals.try_emplace(V, V);

  auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
               Value *V) -> unsigned & {
    auto *It = MV.find(V);
    assert(It != MV.end() && "Unable to find given key.");
    return It->second;
  };

  DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
  // List of the values that were reduced in other trees as part of gather
  // nodes and thus requiring extract if fully vectorized in other trees.
  SmallPtrSet<Value *, 4> RequiredExtract;
  WeakTrackingVH VectorizedTree = nullptr;
  bool CheckForReusedReductionOps = false;
  // Try to vectorize elements based on their type.
  SmallVector<InstructionsState> States;
  for (ArrayRef<Value *> RV : ReducedVals)
    States.push_back(getSameOpcode(RV, TLI));
  for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
    ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
    InstructionsState S = States[I];
    SmallVector<Value *> Candidates;
    Candidates.reserve(2 * OrigReducedVals.size());
    DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
    for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
      Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
      // Check if the reduction value was not overridden by an extractelement
      // instruction because of the vectorization, and exclude it if it is not
      // compatible with the other values.
      auto *Inst = dyn_cast<Instruction>(RdxVal);
      if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
           (!S || !S.isOpcodeOrAlt(Inst))) ||
          (S && !Inst))
        continue;
      Candidates.push_back(RdxVal);
      TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
    }
    bool ShuffledExtracts = false;
    // Try to handle shuffled extractelements.
    if (S && S.getOpcode() == Instruction::ExtractElement &&
        !S.isAltShuffle() && I + 1 < E) {
      SmallVector<Value *> CommonCandidates(Candidates);
      for (Value *RV : ReducedVals[I + 1]) {
        Value *RdxVal = TrackedVals.at(RV);
        auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
        if (!Inst)
          continue;
        CommonCandidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, RV);
      }
      SmallVector<int> Mask;
      if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
        ++I;
        Candidates.swap(CommonCandidates);
        ShuffledExtracts = true;
      }
    }

    // Emit code for constant values.
    if (Candidates.size() > 1 && allConstant(Candidates)) {
      Value *Res = Candidates.front();
      Value *OrigV = TrackedToOrig.at(Candidates.front());
      ++VectorizedVals.try_emplace(OrigV).first->getSecond();
      for (Value *VC : ArrayRef(Candidates).drop_front()) {
        Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
        Value *OrigV = TrackedToOrig.at(VC);
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        if (auto *ResI = dyn_cast<Instruction>(Res))
          V.analyzedReductionRoot(ResI);
      }
      VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
      continue;
    }

    unsigned NumReducedVals = Candidates.size();
    if (NumReducedVals < ReductionLimit &&
        (NumReducedVals < 2 || !isSplat(Candidates)))
      continue;

    // Check if we support repeated scalar values processing (optimization of
    // original scalar identity operations on matched horizontal reductions).
    IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                  RdxKind != RecurKind::FMul &&
                                  RdxKind != RecurKind::FMulAdd;
    // Gather same values.
    SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
    if (IsSupportedHorRdxIdentityOp)
      for (Value *V : Candidates) {
        Value *OrigV = TrackedToOrig.at(V);
        ++SameValuesCounter.try_emplace(OrigV).first->second;
      }
    // Used to check if the reduced values used the same number of times. In
    // that case the compiler may produce better code.
    bool SameScaleFactor = false;
    bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                            SameValuesCounter.size() != Candidates.size();
    if (OptReusedScalars) {
      SameScaleFactor =
          (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
           RdxKind == RecurKind::Xor) &&
          all_of(drop_begin(SameValuesCounter),
                 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                   return P.second == SameValuesCounter.front().second;
                 });
      Candidates.resize(SameValuesCounter.size());
      transform(SameValuesCounter, Candidates.begin(),
                [&](const auto &P) { return TrackedVals.at(P.first); });
      NumReducedVals = Candidates.size();
      // Have a reduction of the same element.
      if (NumReducedVals == 1) {
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        unsigned Cnt = At(SameValuesCounter, OrigV);
        Value *RedVal =
            emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(OrigV, Cnt);
        ExternallyUsedValues.insert(OrigV);
        continue;
      }
    }

    unsigned MaxVecRegSize = V.getMaxVecRegSize();
    unsigned EltSize = V.getVectorElementSize(Candidates[0]);
    const unsigned MaxElts = std::clamp<unsigned>(
        llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
        RegMaxNumber * RedValsMaxNumber);

    unsigned ReduxWidth = NumReducedVals;
    // Shrink the candidate width until the widened type fits the register
    // budget reported by TTI.
    auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
      unsigned NumParts, NumRegs;
      Type *ScalarTy = Candidates.front()->getType();
      ReduxWidth =
          getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
      VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
      NumParts = TTI.getNumberOfParts(Tp);
      NumRegs =
          TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
      while (NumParts > NumRegs) {
        ReduxWidth = bit_floor(ReduxWidth - 1);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = TTI.getNumberOfParts(Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
      }
      if (NumParts > NumRegs / 2)
        ReduxWidth = bit_floor(ReduxWidth);
      return ReduxWidth;
    };
    if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
      ReduxWidth = GetVectorFactor(ReduxWidth);
    ReduxWidth = std::min(ReduxWidth, MaxElts);
    unsigned Start = 0;
    unsigned Pos = Start;
    // Restarts vectorization attempt with lower vector factor.
    unsigned PrevReduxWidth = ReduxWidth;
    bool CheckForReusedReductionOpsLocal = false;
    auto AdjustReducedVals = [&](bool IgnoreVL = false) {
      bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
      if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
        // Check if any of the reduction ops are gathered. If so, worth
        // trying again with less number of reduction ops.
        CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
      }
      ++Pos;
      if (Pos < NumReducedVals - ReduxWidth + 1)
        return IsAnyRedOpGathered;
      Pos = Start;
      --ReduxWidth;
      if (ReduxWidth > 1)
        ReduxWidth = GetVectorFactor(ReduxWidth);
      return IsAnyRedOpGathered;
    };
    bool AnyVectorized = false;
    SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
    while (Pos < NumReducedVals - ReduxWidth + 1 &&
           ReduxWidth >= ReductionLimit) {
      // Dependency in tree of the reduction ops - drop this attempt, try
      // later.
      if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
          Start == 0) {
        CheckForReusedReductionOps = true;
        break;
      }
      PrevReduxWidth = ReduxWidth;
      ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
      // Been analyzed already - skip.
      if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
          (!has_single_bit(ReduxWidth) &&
           (IgnoredCandidates.contains(
                std::make_pair(Pos, bit_floor(ReduxWidth))) ||
            IgnoredCandidates.contains(
                std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                               bit_floor(ReduxWidth))))) ||
          V.areAnalyzedReductionVals(VL)) {
        (void)AdjustReducedVals(/*IgnoreVL=*/true);
        continue;
      }
      // Early exit if any of the reduction values were deleted during
      // previous vectorization attempts.
      if (any_of(VL, [&V](Value *RedVal) {
            auto *RedValI = dyn_cast<Instruction>(RedVal);
            if (!RedValI)
              return false;
            return V.isDeleted(RedValI);
          }))
        break;
      V.buildTree(VL, IgnoreList);
      if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
        continue;
      }
      if (V.isLoadCombineReductionCandidate(RdxKind)) {
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
        continue;
      }
      V.reorderTopToBottom();
      // No need to reorder the root node at all.
      V.reorderBottomToTop(/*IgnoreReorder=*/true);
      // Keep extracted other reduction values, if they are used in the
      // vectorization trees.
      BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
          ExternallyUsedValues);
      // The reduction root is used as the insertion point for new
      // instructions, so set it as externally used to prevent it from being
      // deleted.
      LocalExternallyUsedValues.insert(ReductionRoot);
      for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
        if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
          continue;
        for (Value *V : ReducedVals[Cnt])
          if (isa<Instruction>(V))
            LocalExternallyUsedValues.insert(TrackedVals[V]);
      }
      if (!IsSupportedHorRdxIdentityOp) {
        // Number of uses of the candidates in the vector of values.
        assert(SameValuesCounter.empty() &&
               "Reused values counter map is not empty");
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *V = Candidates[Cnt];
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      }
      V.transformNodes();
      SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
      // Gather externally used values.
      SmallPtrSet<Value *, 4> Visited;
      for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
        if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
          continue;
        Value *RdxVal = Candidates[Cnt];
        if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
          RdxVal = It->second;
        if (!Visited.insert(RdxVal).second)
          continue;
        // Check if the scalar was vectorized as part of the vectorization
        // tree but not the top node.
        if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
          LocalExternallyUsedValues.insert(RdxVal);
          continue;
        }
        Value *OrigV = TrackedToOrig.at(RdxVal);
        unsigned NumOps =
            VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
        if (NumOps != ReducedValsToOps.at(OrigV).size())
          LocalExternallyUsedValues.insert(RdxVal);
      }
      // Do not need the list of reused scalars in regular mode anymore.
      if (!IsSupportedHorRdxIdentityOp)
        SameValuesCounter.clear();
      for (Value *RdxVal : VL)
        if (RequiredExtract.contains(RdxVal))
          LocalExternallyUsedValues.insert(RdxVal);
      V.buildExternalUses(LocalExternallyUsedValues);

      V.computeMinimumValueSizes();

      // Estimate cost.
      InstructionCost TreeCost = V.getTreeCost(VL);
      InstructionCost ReductionCost =
          getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
      InstructionCost Cost = TreeCost + ReductionCost;
      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for reduction\n");
      if (!Cost.isValid())
        break;
      if (Cost >= -SLPCostThreshold) {
        V.getORE()->emit([&]() {
          return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                          ReducedValsToOps.at(VL[0]).front())
                 << "Vectorizing horizontal reduction is possible "
                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
                 << " and threshold "
                 << ore::NV("Threshold", -SLPCostThreshold);
        });
        if (!AdjustReducedVals()) {
          V.analyzedReductionVals(VL);
          unsigned Offset = Pos == Start ? Pos : Pos - 1;
          if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
            // Add subvectors of VL to the list of the analyzed values.
            for (unsigned VF = getFloorFullVectorNumberOfElements(
                     *TTI, VL.front()->getType(), ReduxWidth - 1);
                 VF >= ReductionLimit;
                 VF = getFloorFullVectorNumberOfElements(
                     *TTI, VL.front()->getType(), VF - 1)) {
              if (has_single_bit(VF) &&
                  V.getCanonicalGraphSize() != V.getTreeSize())
                continue;
              for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
            }
          }
        }
        continue;
      }

      LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                        << Cost << ". (HorRdx)\n");
      V.getORE()->emit([&]() {
        return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                  ReducedValsToOps.at(VL[0]).front())
               << "Vectorized horizontal reduction with cost "
               << ore::NV("Cost", Cost) << " and with tree size "
               << ore::NV("TreeSize", V.getTreeSize());
      });

      Builder.setFastMathFlags(RdxFMF);

      // Emit a reduction. If the root is a min/max select, the insertion
      // point is the compare condition of that select.
      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
      Instruction *InsertPt = RdxRootInst;
      if (IsCmpSelMinMax)
        InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

      // Vectorize a tree.
      Value *VectorizedRoot =
          V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
      // Update TrackedToOrig mapping, since the tracked values might be
      // updated.
      for (Value *RdxVal : Candidates) {
        Value *OrigVal = TrackedToOrig.at(RdxVal);
        Value *TransformedRdxVal = TrackedVals.at(OrigVal);
        if (TransformedRdxVal != RdxVal)
          TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
      }

      Builder.SetInsertPoint(InsertPt);

      // To prevent poison from leaking across what used to be sequential,
      // safe, scalar boolean logic operations, the reduction operand must be
      // frozen.
      if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

      // Emit code to correctly handle reused reduced values, if required.
      if (OptReusedScalars && !SameScaleFactor) {
        VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                       SameValuesCounter, TrackedToOrig);
      }

      Value *ReducedSubTree;
      Type *ScalarTy = VL.front()->getType();
      if (isa<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        unsigned ScalarTyNumElements = getNumElements(ScalarTy);
        ReducedSubTree = PoisonValue::get(
            getWidenedType(ScalarTy->getScalarType(), ScalarTyNumElements));
        for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
          // Do reduction for each lane.
          SmallVector<int, 16> Mask =
              createStrideMask(I, ScalarTyNumElements, VL.size());
          Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
          ReducedSubTree = Builder.CreateInsertElement(
              ReducedSubTree,
              emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
        }
      } else {
        ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
                                       RdxRootInst->getType());
      }
      if (ReducedSubTree->getType() != VL.front()->getType()) {
        assert(ReducedSubTree->getType() != VL.front()->getType() &&
               "Expected different reduction type.");
        ReducedSubTree =
            Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                  V.isSignedMinBitwidthRootNode());
      }

      // Improved analysis for add/fadd/xor reductions with same scale factor
      // for all operands of reductions: emit a single reduction plus scale.
      if (OptReusedScalars && SameScaleFactor)
        ReducedSubTree = emitScaleForReusedOps(
            ReducedSubTree, Builder, SameValuesCounter.front().second);

      VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
      // Count vectorized reduced values to exclude them from final reduction.
      for (Value *RdxVal : VL) {
        Value *OrigV = TrackedToOrig.at(RdxVal);
        if (IsSupportedHorRdxIdentityOp) {
          VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
          continue;
        }
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        if (!V.isVectorized(RdxVal))
          RequiredExtract.insert(RdxVal);
      }
      Pos += ReduxWidth;
      Start = Pos;
      ReduxWidth = NumReducedVals - Pos;
      if (ReduxWidth > 1)
        ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
      AnyVectorized = true;
    }
    if (OptReusedScalars && !AnyVectorized) {
      for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
        Value *RdxVal = TrackedVals.at(P.first);
        Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(P.first, P.second);
      }
    }
  }
  if (VectorizedTree) {
    // Reorder operands of bool logical op in the natural order to avoid
    // possible problem with poison propagation. If not possible to reorder
    // (both operands are originally RHS), emit an extra freeze instruction
    // for the LHS operand.
    auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                 Instruction *RedOp1,
                                                 Instruction *RedOp2,
                                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    isGuaranteedNotToBePoison(LHS, AC)))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    isGuaranteedNotToBePoison(RHS, AC))) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: fold in the extra arguments and the not fully
    // vectorized reduction values, pairing them up to limit dependencies
    // between the scalar remainders.
    auto FinalGen =
        [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
            bool InitStep) {
          unsigned Sz = InstVals.size();
          SmallVector<std::pair<Instruction *, Value *>> ExtraReds(
              Sz / 2 + Sz % 2);
          for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
            Instruction *RedOp = InstVals[I + 1].first;
            Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
            Value *RdxVal1 = InstVals[I].second;
            Value *StableRdxVal1 = RdxVal1;
            auto It1 = TrackedVals.find(RdxVal1);
            if (It1 != TrackedVals.end())
              StableRdxVal1 = It1->second;
            Value *RdxVal2 = InstVals[I + 1].second;
            Value *StableRdxVal2 = RdxVal2;
            auto It2 = TrackedVals.find(RdxVal2);
            if (It2 != TrackedVals.end())
              StableRdxVal2 = It2->second;
            // To prevent poison from leaking across what used to be
            // sequential, safe, scalar boolean logic operations, the
            // reduction operand must be frozen.
            FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                              RedOp, InitStep);
            Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                       StableRdxVal2, "op.rdx", ReductionOps);
            ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
          }
          if (Sz % 2 == 1)
            ExtraReds[Sz / 2] = InstVals.back();
          return ExtraReds;
        };
    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
    ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                 VectorizedTree);
    SmallPtrSet<Value *, 8> Visited;
    for (ArrayRef<Value *> Candidates : ReducedVals) {
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
    }
    // Iterate through all not-vectorized reduction values/extra arguments.
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;

    ReductionRoot->replaceAllUsesWith(VectorizedTree);

    // The original scalar reduction is expected to have no remaining uses
    // outside the reduction operations themselves.
#ifndef NDEBUG
    SmallSet<Value *, 4> IgnoreSet;
    for (ArrayRef<Value *> RdxOps : ReductionOps)
      IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
    for (ReductionOpsType &RdxOps : ReductionOps) {
      for (Value *Ignore : RdxOps) {
        if (!Ignore)
          continue;
#ifndef NDEBUG
        for (auto *U : Ignore->users()) {
          assert(IgnoreSet.count(U) &&
                 "All users must be either in the reduction ops list.");
        }
#endif
        if (!Ignore->use_empty()) {
          Value *P = PoisonValue::get(Ignore->getType());
          Ignore->replaceAllUsesWith(P);
        }
      }
      V.removeInstructionsAndOperands(RdxOps);
    }
  } else if (!CheckForReusedReductionOps) {
    // Rebuild the list of reduction ops for subsequent analysis.
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps)
        V.analyzedReductionRoot(cast<Instruction>(RdxOp));
  }
  return VectorizedTree;
}
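The width-selection dance in GetVectorFactor above is easier to see in isolation. Below is a minimal, self-contained sketch of the same idea, not part of SLPVectorizer.cpp; RegBits and NumRegs are hypothetical stand-ins for the TargetTransformInfo queries the pass actually performs.

// Illustrative sketch only: shrink a candidate reduction width with
// bit_floor until the widened vector no longer needs more register "parts"
// than the target has registers.
#include <algorithm>
#include <bit>

unsigned pickReduxWidth(unsigned NumVals, unsigned EltBits, unsigned RegBits,
                        unsigned NumRegs) {
  unsigned Width = std::bit_floor(NumVals); // start at a power of two
  auto Parts = [&](unsigned W) {
    return (W * EltBits + RegBits - 1) / RegBits; // registers the vector spans
  };
  // Mirror of GetVectorFactor: drop to the next smaller power of two while
  // the widened vector needs more registers than are available.
  while (Width > 1 && Parts(Width) > NumRegs)
    Width = std::bit_floor(Width - 1);
  return Width;
}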
InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                 ArrayRef<Value *> ReducedVals,
                                 bool IsCmpSelMinMax, FastMathFlags FMF,
                                 const BoUpSLP &R) {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *ScalarTy = ReducedVals.front()->getType();
  unsigned ReduxWidth = ReducedVals.size();
  FixedVectorType *VectorTy = R.getReductionType();
  InstructionCost VectorCost = 0, ScalarCost;
  // If all of the reduced values are constant, the vector cost is 0, since
  // the reduction value can be calculated at compile time.
  bool AllConsts = allConstant(ReducedVals);
  auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
    InstructionCost Cost = 0;
    // Scalar cost is repeated for N-1 elements.
    int Cnt = ReducedVals.size();
    for (Value *RdxVal : ReducedVals) {
      if (Cnt == 1)
        break;
      --Cnt;
      if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
        Cost += GenCostFn();
        continue;
      }
      InstructionCost ScalarCost = 0;
      for (User *U : RdxVal->users()) {
        auto *RdxOp = cast<Instruction>(U);
        if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
          ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
          continue;
        }
        ScalarCost = InstructionCost::getInvalid();
        break;
      }
      if (ScalarCost.isValid())
        Cost += ScalarCost;
      else
        Cost += GenCostFn();
    }
    return Cost;
  };
  switch (RdxKind) {
  case RecurKind::Add:
  case RecurKind::Mul:
  case RecurKind::Or:
  case RecurKind::And:
  case RecurKind::Xor:
  case RecurKind::FAdd:
  case RecurKind::FMul: {
    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
    if (!AllConsts) {
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        // Wide-vector scalars (revectorization) are reduced lane by lane.
        for (unsigned I : seq<unsigned>(ReducedVals.size())) {
          // ... per-lane shuffle and reduction costs ...
        }
      } else {
        Type *RedTy = VectorTy->getElementType();
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        if (RType == RedTy) {
          VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
                                                       FMF, CostKind);
        } else {
          // ... account for the widening/narrowing of the root node ...
        }
      }
    }
    ScalarCost = EvaluateScalarCost([&]() {
      return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
    });
    break;
  }
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin: {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
    if (!AllConsts)
      VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
    ScalarCost = EvaluateScalarCost([&]() {
      IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
      return TTI->getIntrinsicInstrCost(ICA, CostKind);
    });
    break;
  }
  default:
    llvm_unreachable("Expected arithmetic or min/max reduction operation");
  }

  LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                    << " for reduction of " << shortBundleName(ReducedVals)
                    << " (It is a splitting reduction)\n");
  return VectorCost - ScalarCost;
}
/// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                     const TargetTransformInfo *TTI, Type *DestTy) {
  assert(VectorizedValue && "Need to have a vectorized tree node");
  assert(RdxKind != RecurKind::FMulAdd &&
         "A call to the llvm.fmuladd intrinsic is not handled yet");

  auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (FTy->getScalarType() == Builder.getInt1Ty() &&
      RdxKind == RecurKind::Add &&
      DestTy->getScalarType() != FTy->getScalarType()) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    Value *V = Builder.CreateBitCast(
        VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
    ++NumVectorInstructions;
    return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
  }
  ++NumVectorInstructions;
  return createSimpleReduction(Builder, VectorizedValue, RdxKind);
}
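The i1 special case above rests on a simple identity: an add-reduction over i1 lanes equals the popcount of the packed lane mask. A tiny standalone check of that identity (illustrative only; the pass emits the equivalent IR with bitcast + llvm.ctpop):

// Illustrative sketch only, not part of SLPVectorizer.cpp.
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  uint8_t Lanes = 0b00001011; // three of the low four i1 lanes are set
  int ViaAdd = 0;
  for (int I = 0; I < 4; ++I)
    ViaAdd += (Lanes >> I) & 1; // scalar add-reduction of the lanes
  // Same result as counting the set bits of the packed mask.
  assert(ViaAdd == std::popcount(static_cast<uint8_t>(Lanes & 0xF)));
  return 0;
}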
/// Emits optimized code for the unique scalar value reused \p Cnt times.
Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                             unsigned Cnt) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  if (Cnt == 1)
    return VectorizedValue;
  switch (RdxKind) {
  case RecurKind::Add: {
    // res = mul vv, n
    Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::Xor: {
    // res = n % 2 ? 0 : vv
    LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                      << ". (HorRdx)\n");
    if (Cnt % 2 == 0)
      return Constant::getNullValue(VectorizedValue->getType());
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // res = fmul v, n
    Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // res = vv; repeating the value does not change the result.
    return VectorizedValue;
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::IFindLastIV:
  case RecurKind::FFindLastIV:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for repeated scalar.");
  }
  return nullptr;
}
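The per-kind scaling rules above follow from scalar algebraic identities. A standalone sketch verifying them for integers (illustrative only; the pass emits the equivalent IR instead of computing anything at compile time):

// Illustrative sketch only, not part of SLPVectorizer.cpp.
#include <cassert>

int main() {
  int X = 7;
  unsigned Cnt = 5;
  // Add: x + x + ... + x (Cnt times) == x * Cnt.
  int Add = 0;
  for (unsigned I = 0; I < Cnt; ++I)
    Add += X;
  assert(Add == X * static_cast<int>(Cnt));
  // Xor: x ^ x ^ ... ^ x is 0 for even Cnt and x for odd Cnt.
  int Xor = 0;
  for (unsigned I = 0; I < Cnt; ++I)
    Xor ^= X;
  assert(Xor == (Cnt % 2 ? X : 0));
  // And/Or/min/max: repeating the same value never changes the result.
  return 0;
}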
/// Emits actual operation for the scalar identity values, found during
/// horizontal reduction analysis.
Value *
emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
              const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
              const DenseMap<Value *, Value *> &TrackedToOrig) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  ArrayRef<Value *> VL = R.getRootNodeScalars();
  auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (VTy->getElementType() != VL.front()->getType()) {
    VectorizedValue = Builder.CreateIntCast(
        VectorizedValue,
        getWidenedType(VL.front()->getType(), VTy->getNumElements()),
        R.isSignedMinBitwidthRootNode());
  }
  switch (RdxKind) {
  case RecurKind::Add: {
    // root = mul prev_root, <1, 1, n, 1>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
    }
    auto *Scale = ConstantVector::get(Vals);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
    // No need for multiple and/or(s).
    LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // No need for repeated min/max(s) of the same value.
    LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::Xor: {
    // Replace values with even number of repeats with 0, since
    // x xor x = 0.
    // root = shuffle prev_root, zeroinitializer,
    //        <0, 1, 2, vf, 4, vf, 5, 6, 7>
    // if the 3rd and 5th elements have an even number of repeats.
    SmallVector<int> Mask(
        cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
        PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    bool NeedShuffle = false;
    for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
      Value *V = VL[I];
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      if (Cnt % 2 == 0) {
        Mask[I] = VF;
        NeedShuffle = true;
      }
    }
    LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
               dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
    if (NeedShuffle)
      VectorizedValue = Builder.CreateShuffleVector(
          VectorizedValue,
          ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      Vals.push_back(ConstantFP::get(V->getType(), Cnt));
    }
    auto *Scale = ConstantVector::get(Vals);
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::IFindLastIV:
  case RecurKind::FFindLastIV:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for reused scalars.");
  }
  return nullptr;
}
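For the RecurKind::Xor case above, the interesting part is the mask construction: lanes whose scalar repeats an even number of times cancel to zero, so the shuffle redirects those lanes to the all-zero second input (indices >= VF select from it). A minimal sketch of just that mask computation, over plain vectors rather than IR (illustrative only):

// Illustrative sketch only, not part of SLPVectorizer.cpp.
#include <numeric>
#include <vector>

std::vector<int> xorRepeatMask(const std::vector<unsigned> &Counts) {
  unsigned VF = Counts.size();
  std::vector<int> Mask(VF);
  std::iota(Mask.begin(), Mask.end(), 0); // identity mask by default
  for (unsigned I = 0; I < VF; ++I)
    if (Counts[I] % 2 == 0)
      Mask[I] = static_cast<int>(VF); // even repeats cancel: pick a zero lane
  return Mask;
}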
/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt; // heterogeneous struct: give up
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
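getAggregateSize flattens nested homogeneous aggregates into a single element count, or gives up on heterogeneous structs. Here is the same rule sketched over a toy type descriptor instead of llvm::Type (illustrative only; the homogeneity check is deliberately simplified):

// Illustrative sketch only, not part of SLPVectorizer.cpp.
#include <optional>
#include <vector>

struct ToyType {
  unsigned NumElements = 1;         // element count for array/vector kinds
  std::vector<ToyType> Fields;      // non-empty for struct-like types
  const ToyType *Element = nullptr; // set for array/vector-like types
};

std::optional<unsigned> aggregateSize(const ToyType &T) {
  unsigned Size = 1;
  const ToyType *Cur = &T;
  while (true) {
    if (!Cur->Fields.empty()) {
      // Toy homogeneity check; the real code compares full element types.
      for (const ToyType &F : Cur->Fields)
        if (F.NumElements != Cur->Fields.front().NumElements)
          return std::nullopt;
      Size *= Cur->Fields.size();
      Cur = &Cur->Fields.front();
    } else if (Cur->Element) {
      Size *= Cur->NumElements;
      Cur = Cur->Element;
    } else {
      return Size; // reached a scalar leaf
    }
  }
}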
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                         R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \Returns the new root if found, which may be nullptr if not an
/// instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \Returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from the root instruction. If a horizontal
  // reduction is found, try to vectorize it. Otherwise, and if the currently
  // analyzed instruction is a binary operation, try to vectorize its
  // operands, using a level-limited breadth-first traversal.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands. Continue analysis for instructions from the
    // same basic block only, to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not vectorize CmpInst operands here, this is done separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with
    //    the size of the maximal register at first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    //    possible.
    // 3. Final attempt to vectorize all instructions with the same/alternate
    //    ops only.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
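tryToVectorizeSequence is an instance of a generic sort-then-group driver: order the items with a comparator, carve the sorted sequence into maximal runs of mutually compatible items, and hand each run of two or more to a callback. A reduced, self-contained sketch of that driver (illustrative only; the three-state retry logic above is omitted):

// Illustrative sketch only, not part of SLPVectorizer.cpp.
#include <algorithm>
#include <functional>
#include <vector>

template <typename T>
bool processRuns(std::vector<T> &Items,
                 std::function<bool(const T &, const T &)> Less,
                 std::function<bool(const T &, const T &)> Compatible,
                 std::function<bool(const std::vector<T> &)> TryVectorize) {
  bool Changed = false;
  std::stable_sort(Items.begin(), Items.end(), Less);
  for (size_t I = 0; I < Items.size();) {
    // Extend the run while neighbors stay compatible with the run's head.
    size_t J = I + 1;
    while (J < Items.size() && Compatible(Items[I], Items[J]))
      ++J;
    if (J - I > 1) {
      std::vector<T> Run(Items.begin() + I, Items.begin() + J);
      Changed |= TryVectorize(Run);
    }
    I = J; // continue after the run
  }
  return Changed;
}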
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps are compatible; otherwise it acts as a
/// strict weak ordering over cmps.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with the same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
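Note the trick in compareCmp: the same template body yields a strict weak ordering when IsCompatibility is false and an equivalence test when it is true, because "return !IsCompatibility" means "less than" in one instantiation and "incompatible" in the other. A distilled sketch of the pattern (illustrative only):

// Illustrative sketch only, not part of SLPVectorizer.cpp.
template <bool IsCompatibility>
static bool compareByKey(int Key1, int Key2) {
  if (Key1 < Key2)
    return !IsCompatibility; // "less" when ordering, "not equal" when matching
  if (Key1 > Key2)
    return false;
  return IsCompatibility; // equal keys: not less, but compatible
}

// Usage: compareByKey<false> sorts; compareByKey<true> groups equal keys.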
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. Allows better identification of the chains that can be vectorized.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
    if (V1 == V2)
      return false;
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-undef values come next.
        if (U1 && U2)
          continue;
        if (U1)
          return false;
        if (U2)
          return true;
      }
      // Undefs come last.
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
        continue;
      if (ValID1 < ValID2)
        return true;
      if (ValID1 > ValID2)
        return false;
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    // Find the corresponding non-phi nodes for better matching when trying
    // to build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction produces no directly usable result
  // (void type or a call whose result is unused).
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable vector type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip the ones we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Start over: some instructions are deleted and the iterator may
        // have become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store, if this is the only store
        // to the address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of the post-process list of instructions from
      // the top-tree instructions, to try to vectorize as many instructions
      // as possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // Start over: some instructions are deleted and the iterator may
        // have become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. We are trying to vectorize the index computations, so the
    // maximum number of elements is based on the size of the index
    // expression rather than the size of the GEP itself.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index may have been optimized to
      // a constant; remove those from the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase, since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
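The candidate-pruning loop above removes pairs of getelementptrs whose SCEV difference is a constant, since one address is then trivially derived from the other. A sketch of the same filtering shape with plain integers standing in for SCEV expressions (illustrative only; ConstantDiff is a hypothetical callback modeling isa<SCEVConstant> on the difference):

// Illustrative sketch only, not part of SLPVectorizer.cpp.
#include <functional>
#include <optional>
#include <set>
#include <vector>

void dropConstantDiffPairs(
    const std::vector<int> &Ids, std::set<int> &Candidates,
    const std::function<std::optional<int>(int, int)> &ConstantDiff) {
  for (size_t I = 0; I < Ids.size() && Candidates.size() > 1; ++I) {
    if (!Candidates.count(Ids[I]))
      continue;
    for (size_t J = I + 1; J < Ids.size() && Candidates.size() > 1; ++J)
      if (ConstantDiff(Ids[I], Ids[J])) { // difference folds to a constant
        Candidates.erase(Ids[I]);
        Candidates.erase(Ids[J]);
      }
  }
}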
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return bool(getSameOpcode({I1, I2}, *TLI));
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times; in this case we
    // need to follow the store order (reversed to meet the memory
    // dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
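StoreSorter above is a cascading tie-breaker: compare type IDs, then pointer-type IDs, then scalar widths, and fall back to value IDs. Written over a stand-in key struct, the whole chain collapses to one std::tie comparison (illustrative only):

// Illustrative sketch only, not part of SLPVectorizer.cpp.
#include <tuple>

struct StoreKey {
  unsigned TypeID;     // value operand's type ID
  unsigned PtrTypeID;  // pointer operand's type ID
  unsigned ScalarBits; // value operand's scalar width
  unsigned ValueID;    // fallback discriminator
};

static bool storeLess(const StoreKey &A, const StoreKey &B) {
  // Equivalent to the chain of <,> comparisons above: lexicographic order on
  // the first differing component.
  return std::tie(A.TypeID, A.PtrTypeID, A.ScalarBits, A.ValueID) <
         std::tie(B.TypeID, B.PtrTypeID, B.ScalarBits, B.ValueID);
}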
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
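ArrayRef is the view type used throughout for scalar bundles (the VL lists). A short standalone sketch of the slicing accessors listed above; the values are illustrative:

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

using namespace llvm;

int main() {
  int Storage[] = {1, 2, 3, 4, 5, 6};
  ArrayRef<int> VL(Storage);
  assert(VL.front() == 1 && VL.back() == 6 && !VL.empty() && VL.size() == 6);
  // slice(N, M): drop the first N elements, keep the next M.
  assert(VL.slice(2, 3).equals({3, 4, 5}));
  assert(VL.drop_front(5).front() == 6);
  assert(VL.take_front(2).drop_back(1).back() == 1);
  return 0;
}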
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR changes between queries.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each of them.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
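The predicate utilities matter when the vectorizer canonicalizes compares across lanes: swapping operands requires the swapped predicate, while logical negation requires the inverse. A standalone sketch; the predicates are chosen arbitrarily:

#include "llvm/IR/InstrTypes.h"
#include <cassert>

using namespace llvm;

int main() {
  // Swapping the operands of `a < b` yields `b > a`.
  assert(CmpInst::getSwappedPredicate(CmpInst::ICMP_SLT) == CmpInst::ICMP_SGT);
  // Negating `a <= b` yields `a > b`.
  assert(CmpInst::getInversePredicate(CmpInst::ICMP_ULE) == CmpInst::ICMP_UGT);
  return 0;
}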
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for its type.
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Returns the number of bits necessary to hold the specified type.
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
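DenseMap's lookup/try_emplace/contains entries above follow the usual LLVM map conventions; the main trap is that lookup returns a default-constructed value for a missing key rather than asserting (that is what at is for). A standalone sketch with illustrative key/value types:

#include "llvm/ADT/DenseMap.h"
#include <cassert>

using namespace llvm;

int main() {
  DenseMap<int, int> UseCount;
  UseCount.try_emplace(7, 1);       // inserted
  UseCount.try_emplace(7, 5);       // key already present: no overwrite
  assert(UseCount.lookup(7) == 1);  // present -> stored value
  assert(UseCount.lookup(9) == 0);  // absent  -> default-constructed int
  assert(UseCount.contains(7) && UseCount.count(9) == 0);
  UseCount.erase(7);
  assert(UseCount.find(7) == UseCount.end());
  return 0;
}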
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock or at a specific iterator location in a block.
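Most of the IRBuilder entry points above appear in the code-generation half of the pass (gathers, broadcasts, masked loads). A minimal standalone sketch of the insert-then-broadcast idiom a gather sequence produces; the module, function, and constant are illustrative:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  auto *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
  IRBuilder<> Builder(BasicBlock::Create(Ctx, "entry", F));

  auto *VecTy = FixedVectorType::get(Builder.getInt32Ty(), 4);
  // Insert a scalar into lane 0, then broadcast it with an all-zero mask.
  Value *V = Builder.CreateInsertElement(PoisonValue::get(VecTy),
                                         Builder.getInt32(7),
                                         Builder.getInt64(0));
  SmallVector<int, 4> Mask(4, 0);
  Value *Splat = Builder.CreateShuffleVector(V, PoisonValue::get(VecTy), Mask);
  (void)Splat;
  Builder.CreateRetVoid();
  return 0;
}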
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its value.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
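These static mask classifiers are how the pass recognizes cheap shuffles (splats, reverses, subvector extracts) without materializing instructions. A standalone sketch on hand-written masks; the masks themselves are illustrative:

#include "llvm/IR/Instructions.h"
#include <cassert>

using namespace llvm;

int main() {
  // <4,5,6,7> extracts the high half of an 8-element source.
  int Extract[] = {4, 5, 6, 7};
  int Index = -1;
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract,
                                                   /*NumSrcElts=*/8, Index));
  assert(Index == 4);
  // <0,0,0,0> broadcasts element 0.
  int Splat[] = {0, 0, 0, 0};
  assert(ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4));
  // <3,2,1,0> reverses a 4-element source.
  int Rev[] = {3, 2, 1, 0};
  assert(ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));
  return 0;
}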
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
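SmallBitVector is the workhorse for per-lane bookkeeping in mask analyses. A standalone sketch of the query/iteration API above; the size and bit choices are illustrative:

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>

using namespace llvm;

int main() {
  SmallBitVector Seen(8); // 8 bits, all clear
  assert(Seen.none() && !Seen.any());
  Seen.set(1);
  Seen.set(5);
  assert(Seen.any() && Seen.count() == 2 && !Seen.all());
  // Walk the set bits with find_first/find_next.
  assert(Seen.find_first() == 1);
  assert(Seen.find_next(1) == 5);
  assert(Seen.find_next(5) == -1);
  assert(Seen.test(5) && !Seen.test(0));
  return 0;
}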
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
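The Type predicates above gate what the vectorizer may widen. A standalone sketch of the scalar/vector round-trip; the types are chosen arbitrarily:

#include "llvm/IR/DerivedTypes.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *V4I32 = FixedVectorType::get(I32, 4);
  assert(V4I32->isVectorTy() && V4I32->isIntOrIntVectorTy());
  assert(V4I32->getScalarType() == I32 && V4I32->getScalarSizeInBits() == 32);
  // getWithNewType swaps the element type but keeps the element count.
  Type *V4F32 = V4I32->getWithNewType(Type::getFloatTy(Ctx));
  assert(V4F32->isFPOrFPVectorTy());
  assert(cast<FixedVectorType>(V4F32)->getNumElements() == 4);
  return 0;
}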
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state with which we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness, if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking for deletion any operands that become trivially dead.
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the root of a profitable tree to vectorize.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) permutation with other gathers.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
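Taken together, the BoUpSLP members above form the pass's driver sequence: build the tree, reorder it, cost it, and emit. The fragment below is only a sketch of that sequence, not the pass's actual code; it assumes it is written inside SLPVectorizer.cpp (BoUpSLP is local to that file), and CostThreshold is a hypothetical stand-in for the pass's real cost cutoff:

// Sketch only: BoUpSLP is internal to SLPVectorizer.cpp, so this must live
// inside that file; CostThreshold is a hypothetical parameter, not the
// pass's actual cl::opt.
static bool tryVectorizeListSketch(BoUpSLP &R, ArrayRef<Value *> VL,
                                   int CostThreshold) {
  R.buildTree(VL, /*UserIgnoreLst=*/{});
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  if (R.getTreeCost() >= -CostThreshold)
    return false; // not profitable under the cost model
  R.vectorizeTree();
  return true;
}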
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
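The m_* combinators above drive reduction and min/max recognition. A standalone sketch of the matcher style; the module, function, and constants are illustrative:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include <cassert>

using namespace llvm;
using namespace llvm::PatternMatch;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  auto *I32 = Type::getInt32Ty(Ctx);
  auto *FTy = FunctionType::get(I32, {I32}, /*isVarArg=*/false);
  auto *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  Value *Arg = F->getArg(0);
  Value *Sum = B.CreateAdd(Arg, B.getInt32(1));
  // m_Add(m_Value(L), m_ConstantInt()) matches `add %x, C` and binds %x.
  Value *L = nullptr;
  assert(match(Sum, m_Add(m_Value(L), m_ConstantInt())) && L == Arg);
  assert(!match(Sum, m_Mul(m_Value(), m_Value())));
  B.CreateRet(Sum);
  return 0;
}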
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be vectorized.
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
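The range helpers above (all_of, count_if, enumerate, seq, and friends) replace explicit begin/end iterator plumbing throughout the pass. A standalone sketch; the data values are illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

using namespace llvm;

int main() {
  SmallVector<int, 8> VL = {4, 8, 15, 16};
  assert(all_of(VL, [](int V) { return V > 0; }));
  assert(none_of(VL, [](int V) { return V < 0; }));
  assert(count_if(VL, [](int V) { return V % 2 == 0; }) == 3);
  // enumerate pairs each element with its index.
  for (auto E : enumerate(VL))
    assert(E.value() == VL[E.index()]);
  // seq iterates the half-open range [Begin, End).
  int Sum = 0;
  for (unsigned I : seq<unsigned>(0, 4))
    Sum += VL[I];
  assert(Sum == 43);
  return 0;
}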
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; the incoming register Reg and incoming block Block are a pair of its arguments.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.