#ifdef EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

    "Controls which SLP graphs should be vectorized.");

    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Enable vectorization for wider vector utilization"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

    cl::desc("Attempt to vectorize horizontal reductions"));

    "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Improve the code quality by splitting alternate instructions"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Disable tree reordering even if it is "
             "profitable. Used for testing only."));

    cl::desc("Generate strided loads even if they are not "
             "profitable. Used for testing only."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));
  Ty = Ty->getScalarType();

         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();

    return CI->getOperand(0)->getType();

    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");

  return VecTy->getNumElements();

                                              Type *Ty, unsigned Sz) {

  if (NumParts == 0 || NumParts >= Sz)

  if (NumParts == 0 || NumParts >= Sz)

  return (Sz / RegVF) * RegVF;

                                  I * VecTyNumElements, VecTyNumElements)))

                  : Mask[I] * VecTyNumElements + J;

    unsigned SVNumElements =

    unsigned ShuffleMaskSize = SV->getShuffleMask().size();
    if (SVNumElements % ShuffleMaskSize != 0)

    unsigned GroupSize = SVNumElements / ShuffleMaskSize;
    if (GroupSize == 0 || (VL.size() % GroupSize) != 0)

    unsigned NumGroup = 0;
    for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {

      Value *Src = SV->getOperand(0);

        if (SV->getOperand(0) != Src)

        if (!SV->isExtractSubvectorMask(Index))

        ExpectedIndex.set(Index / ShuffleMaskSize);

    if (!ExpectedIndex.all())

    assert(NumGroup == (VL.size() / GroupSize) &&
           "Unexpected number of groups");

  unsigned SVNumElements =

  unsigned AccumulateLength = 0;
  for (Value *V : VL) {

    for (int M : SV->getShuffleMask())

                             : AccumulateLength + M);
    AccumulateLength += SVNumElements;

  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";

    if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {

    if (!FirstNonUndef) {

    if (V != FirstNonUndef)

  return FirstNonUndef != nullptr;

    return Cmp->isCommutative();

    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&

                 if (match(U.getUser(),
                           m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                     (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))

                 return match(U.getUser(),
                              m_Intrinsic<Intrinsic::abs>(
                                  m_Specific(U.get()), m_ConstantInt(Flag))) &&
                        (!cast<Instruction>(U.get())->hasNoSignedWrap() ||

           (BO->getOpcode() == Instruction::FSub &&

                 return match(U.getUser(),
                              m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));

  return I->isCommutative();

    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;

  return I->getNumOperands();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,

    if (CI->getValue().uge(VT->getNumElements()))

    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {

      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);

      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
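// Worked example (added for illustration, not in the original): the loop
// above linearizes a multi-level insertvalue index. For
//   %r = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// the struct step gives Index = 0 * 2 + 1 = 1 and the array step gives
// Index = 1 * 2 + 0 = 2, i.e. the scalar lands in flattened lane 2.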
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);

      if (MaskArg == UseMask::UndefsAsMask)

      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>

  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;

    if (!UseMask.empty()) {

      if (*Idx < UseMask.size() && !UseMask.test(*Idx))

    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))

            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))

static std::optional<TargetTransformInfo::ShuffleKind>

      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);

        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());

        return std::max(S, VTy->getNumElements());

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;

    Value *Vec = EE->getVectorOperand();

  ShuffleMode CommonShuffleMode = Unknown;

  for (unsigned I = 0, E = VL.size(); I < E; ++I) {

    auto *Vec = EI->getVectorOperand();

    if (Idx->getValue().uge(Size))

    unsigned IntIdx = Idx->getValue().getZExtValue();

    if (!Vec1 || Vec1 == Vec) {

    } else if (!Vec2 || Vec2 == Vec) {

    if (CommonShuffleMode == Permute)

    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;

    CommonShuffleMode = Select;

  if (CommonShuffleMode == Select && Vec2)

  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {

      return CI->getZExtValue();

  if (EI->getNumIndices() != 1)

  return *EI->idx_begin();

bool isValidForAlternation(unsigned Opcode) {

class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;

  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};

    MainOpBIT = 0b100000000,

  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();

    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)

  struct InterchangeableInfo {

    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;

    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}

    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;

    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);

      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;

    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:

      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)

      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();

      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {

          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);

      case Instruction::Mul:

        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());

          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);

      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {

                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;

      case Instruction::And:

        ToCIValue = ToOpcode == Instruction::Mul
                        ? APInt(FromCIValueBitWidth, 1)
                        : APInt::getZero(FromCIValueBitWidth);

        assert(FromCIValue.isZero() && "Cannot convert the instruction.");

      Value *LHS = I->getOperand(1 - Pos);

          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);

          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))

  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());

  bool initializeAltOp(const Instruction *I) {

    if (!isValidForAlternation(I))

  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {

  bool add(const Instruction *I) {

           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;

      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));

    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;

      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:

        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;

        InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:

        InterchangeableMask = CanBeAll;

        InterchangeableMask = CanBeAll;

    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));

  unsigned getMainOpcode() const { return MainOp.getOpcode(); }

  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);

  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();

    return MainOp.getOperand(I);
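// Illustration (added, not from the original file): BinOpSameOpcodeHelper lets
// lanes with literally different binary opcodes be bundled under one opcode
// when a constant operand makes them interchangeable. For example
//
//   %a = shl i32 %x, 1   ; convertible to  mul i32 %x, 2
//   %b = mul i32 %y, 3
//
// can form a single <2 x i32> mul bundle, with the shl lane rewritten using
// ToCIValue = 1 << 1 = 2.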
class InstructionsState {

  bool HasCopyables = false;

    assert(valid() && "InstructionsState is invalid.");

    assert(valid() && "InstructionsState is invalid.");

  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())

    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())

    if (!I->isBinaryOp())

    BinOpSameOpcodeHelper Converter(MainOp);

    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))

    if (Converter.hasAltOp() && !isAltShuffle())

    return Converter.hasAltOp() ? AltOp : MainOp;

  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();

    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();

  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};

  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};

  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||

  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }

  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");

    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)

    if (I->getParent() != MainOp->getParent() &&

    if (I->getOpcode() == MainOp->getOpcode())

    if (!I->isBinaryOp())

    BinOpSameOpcodeHelper Converter(MainOp);

  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");

    if (getMainOp() == V)

    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {

        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||

                 !MainOp->comesBefore(I));

      return IsNonSchedulableCopyableElement(V);

  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
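// Illustration (added, not from the original file): InstructionsState models
// bundles whose lanes split between a main and an alternate opcode. For
//
//   %a0 = add i32 %x0, %y0
//   %s1 = sub i32 %x1, %y1
//
// MainOp is the add, AltOp is the sub, and isAltShuffle() returns true;
// codegen emits both vector ops and blends them with a shufflevector.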
std::pair<Instruction *, SmallVector<Value *>>

  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {

    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));

  for (Value *V : VL) {

    if (Inst->getOpcode() == Opcode)

      BaseOp0 == Op0 || BaseOp1 == Op1 ||

         "Assessing comparisons of different types?");

  return (BasePred == Pred &&

         (BasePred == SwappedPred &&

    return InstructionsState::invalid();

    return InstructionsState::invalid();

      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  unsigned AltOpcode = Opcode;

  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {

    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {

      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);

    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;

    return InstructionsState::invalid();

  bool AnyPoison = InstCnt != VL.size();

    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();

      if (BinOpHelper.add(I))

      Value *Op1 = I->getOperand(0);

      if (InstOpcode == Opcode || InstOpcode == AltOpcode)

      if (Opcode == AltOpcode) {
        assert(isValidForAlternation(Opcode) &&
               isValidForAlternation(InstOpcode) &&
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;

      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();

        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))

        if (MainOp != AltOp) {

        } else if (BasePred != CurrentPred) {

                 isValidForAlternation(InstOpcode) &&
                 "CmpInst isn't safe for alternation, logic needs to be updated!");

        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)

    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "

        if (Gep->getNumOperands() != 2 ||

          return InstructionsState::invalid();

          return InstructionsState::invalid();

        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();

          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&

            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),

          return InstructionsState::invalid();

          return InstructionsState::invalid();

        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();

      return InstructionsState::invalid();

  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");

  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");

         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);

         "Invalid InstructionsState.");

  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });

  unsigned Opcode = UserInst->getOpcode();

  case Instruction::Load: {

  case Instruction::Store: {

    return (SI->getPointerOperand() == Scalar);

  case Instruction::Call: {

      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;

    return LI->isSimple();

    return SI->isSimple();

    return !MI->isVolatile();

                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())

         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||

         "SubMask with many inputs support must be larger than the mask.");

    Mask.append(SubMask.begin(), SubMask.end());

  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {

        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))

    NewMask[I] = Mask[SubMask[I]];
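// Worked example (added for illustration): addMask composes permutations.
// With Mask = {1, 0, 3, 2} already applied and SubMask = {2, 3, 0, 1}, the
// loop computes NewMask[I] = Mask[SubMask[I]], giving {3, 2, 1, 0} -- the
// result of applying Mask first and SubMask on top of it.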
  const size_t Sz = Order.size();

  for (unsigned I = 0; I < Sz; ++I) {

      UnusedIndices.reset(Order[I]);

      MaskedIndices.set(I);

  if (MaskedIndices.none())

         "Non-synced masked/available indices.");

    assert(Idx >= 0 && "Indices must be synced.");

                               unsigned Opcode0, unsigned Opcode1) {

      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);

         "Expected scalar constants.");

    std::fill_n(NewVal.begin() + I * VF, VF, V);

  const unsigned E = Indices.size();

  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
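// Worked example (added for illustration): the loop above inverts an order.
// For Indices = {2, 0, 1} it produces Mask[2] = 0, Mask[0] = 1, Mask[1] = 2,
// i.e. Mask = {1, 2, 0}: the shuffle mask that restores the original order.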
  assert(!Mask.empty() && "Expected non-empty mask.");

  for (unsigned I = 0, E = Prev.size(); I < E; ++I)

      Scalars[Mask[I]] = Prev[I];

    auto *IO = dyn_cast<Instruction>(V);

    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&

           auto *IU = dyn_cast<Instruction>(U);

           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&

  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&

    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)

  if (NumParts >= Sz || Sz % NumParts != 0 ||

  class ScheduleEntity;

  class ScheduleCopyableData;
  class ScheduleBundle;

  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),

    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();

      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())

    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))

        VectorizableTree.front()->Scalars.front()->getType(),
        VectorizableTree.front()->getVectorFactor());

        VectorizableTree.front()->Scalars.front()->getContext(),

        VectorizableTree.front()->getVectorFactor());

    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();

    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();

    ReductionBitWidth = 0;

    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();

    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();

      return P.value() == P.index() || P.value() == Sz;

                        bool IgnoreReorder);

  std::optional<OrdersType>

    return MaxVecRegSize;

    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
                         MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;

                            const bool IsAnyPointerUsedOutGraph,
                            const int64_t Diff,
                            StridedPtrInfo &SPtrInfo) const;

                     StridedPtrInfo &SPtrInfo,
                     unsigned *BestVF = nullptr,
                     bool TryRecursiveCheck = true) const;

    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));

  template <typename T>

    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";

        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

        auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {

          auto AllUsersVectorized = [U1, U2, this](Value *V) {

              return U == U1 || U == U2 || R.isVectorized(U);

          return AllUsersVectorized(V1) && AllUsersVectorized(V2);

        if (R.TTI->isLegalBroadcastLoad(V1->getType(),

            ((int)V1->getNumUses() == NumLanes ||
             AllUsersAreInternal(V1, V2)))

      auto CheckSameEntryOrFail = [&]() {

            any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))

        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||

          return CheckSameEntryOrFail();

            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {

              R.TTI->isLegalMaskedGather(

            return CheckSameEntryOrFail();

        if (std::abs(*Dist) > NumLanes / 2)

        Value *EV2 = nullptr;

          int Dist = Idx2 - Idx1;

          if (std::abs(Dist) == 0)

          if (std::abs(Dist) > NumLanes / 2)

        return CheckSameEntryOrFail();

        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();

            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&

                S.getMainOp()->getNumOperands();

      return CheckSameEntryOrFail();

      int ShallowScoreAtThisLevel =

      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||

          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||

                               ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {

        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;

                ? I2->getNumOperands()
                : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {

          if (Op2Used.count(OpIdx2))

              I1, I2, CurrLevel + 1, {});

              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;

          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;

      return ShallowScoreAtThisLevel;
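// Illustration (added, not from the original file): the look-ahead heuristic
// above scores candidate operand pairs by recursively comparing their
// definitions. Given lanes (a0 + b0) and (a1 + b1) where a0/a1 are consecutive
// loads, pairing a0 with a1 scores higher than pairing a0 with b1, so operand
// reordering keeps the load chain in one vector operand.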
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}

    bool IsUsed = false;

  enum class ReorderingMode {

  unsigned ArgSize = 0;

  const Loop *L = nullptr;

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];

  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();

      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;

        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);

  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,

    Value *IdxLaneV = getData(Idx, Lane).V;

    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;

    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)

    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
                        bit_floor(UniquesCntWithOpIdxLaneV)) -
           ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
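// Worked example (added for illustration): with UniquesCntWithOpIdxLaneV = 3,
// bit_ceil(3) - 3 = 1 and 3 - bit_floor(3) = 1, so the base term is
// min(1, 1) = 1; the subtracted term then rewards reusing a value from an
// already-used lane over introducing a new unique value.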
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;

    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

                        int Lane, unsigned OpIdx, unsigned Idx,

      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {

        Score += SplatScore;

        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);

  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,

    unsigned NumOperands = getNumOperands();

    Value *OpLastLane = getData(OpIdx, LastLane).V;

    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;

    bool OpIdxAPO = getData(OpIdx, Lane).APO;

      std::optional<unsigned> Idx;

        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)

    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;

    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {

      OperandData &OpData = getData(Idx, Lane);

      bool OpAPO = OpData.APO;

      if (OpAPO != OpIdxAPO)

      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&

          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;

      case ReorderingMode::Constant:

            (!BestOp.Score && L && L->isLoopInvariant(Op))) {

          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =

      case ReorderingMode::Splat:

        IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {

          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =

      case ReorderingMode::Failed:

      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;

    return std::nullopt;

  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;

    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);

      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;

        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {

        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);

    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;

      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;

  struct OperandsOrderData {

    unsigned NumOfAPOs = UINT_MAX;

    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();

    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;

      const OperandData &OpData = getData(OpIdx, Lane);

            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;

          Parent = I->getParent();

          --NumOpsWithSameOpcodeParent;

          ++NumOpsWithSameOpcodeParent;

    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;

                      const InstructionsState &S) {

             return VL.size() == getNumLanes();
           }) &&
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");

    OpsVec.resize(ArgSize);
    unsigned NumLanes = VL.size();
    for (OperandDataVec &Ops : OpsVec)
      Ops.resize(NumLanes);

      bool IsInverseOperation = false;
      if (S.isCopyableElement(VL[Lane])) {

        assert(I && "Expected instruction");
        auto [SelectedOp, Ops] = convertTo(I, S);

        bool APO = (OpIdx == 0) ? false : IsInverseOperation;

  unsigned getNumOperands() const { return ArgSize; }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }

  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {

           "Op is expected to be getValue(OpIdx, Lane).");

    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);

    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {

      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)

        Value *OpILane = getValue(OpI, Lane);

              L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;

      if (!FoundCandidate)

    return getNumLanes() == 2 || Cnt > 1;

           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {

        const OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)

        Value *OpILn = getValue(OpI, Ln);
        return (L && L->isLoopInvariant(OpILn)) ||

              const InstructionsState &S, const BoUpSLP &R)
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
        L(R.LI->getLoopFor(S.getMainOp()->getParent())) {

    appendOperands(RootVL, Operands, S);

           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;

    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();

    unsigned FirstLane = getBestLaneToStartReordering();

        if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
            !canBeVectorized(OpILane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;

          ReorderingModes[OpIdx] = ReorderingMode::Load;

          ReorderingModes[OpIdx] = ReorderingMode::Opcode;

        ReorderingModes[OpIdx] = ReorderingMode::Constant;

        ReorderingModes[OpIdx] = ReorderingMode::Splat;

    auto &&SkipReordering = [this]() {

      for (const OperandData &Data : Op0)

           ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {

      return UniqueValues.size() != 2 &&

                            UniqueValues.size());

    if (SkipReordering())

    bool StrategyFailed = false;

    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);

    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {

      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)

        UsedLanes.set(Lane);
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&

          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);

            swap(OpIdx, *BestIdx, Lane);

            StrategyFailed = true;

            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =

            if (OpS && OpS.isAltShuffle())

    if (!StrategyFailed)

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

    case ReorderingMode::Load:

    case ReorderingMode::Opcode:

    case ReorderingMode::Constant:

    case ReorderingMode::Splat:

    case ReorderingMode::Failed:

    const unsigned Indent = 2;

    for (const OperandDataVec &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)

        OS << ", APO:" << OpData.APO << "}\n";

    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {

                                     Candidates[I].second,

      if (Score > BestScore) {

    DeletedInstructions.insert(I);

  template <typename T>

      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {

    for (T *V : DeadVals) {

    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)

      for (Use &U : I->operands()) {

            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&

            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;

      I->dropAllReferences();

    for (T *V : DeadVals) {

      if (!I->getParent())

              cast<Instruction>(U.getUser()));

             "trying to erase instruction with users.");
      I->removeFromParent();

    while (!DeadInsts.empty()) {

      if (!VI || !VI->getParent())

             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();

          if (!DeletedInstructions.contains(OpI) &&
              (!OpI->getType()->isVectorTy() ||
               none_of(VectorValuesAndScales,
                       [&](const std::tuple<Value *, unsigned, bool> &V) {
                         return std::get<0>(V) == OpI;

      VI->removeFromParent();

      SE->forgetValue(VI);

    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    return AnalyzedReductionVals.contains(hash_value(VL));

    AnalyzedReductionVals.insert(hash_value(VL));

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

    return MustGather.contains(V);

    return NonScheduledFirst.contains(V);

    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);

  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,

      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  void buildReorderableOperands(

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));

  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  getCastContextHint(const TreeEntry &TE) const;

                           const InstructionsState &LocalState,

                           unsigned InterleaveFactor = 0);

                    bool ResizeAllowed = false) const;

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>

                             unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(

      unsigned NumParts, bool ForOrder = false);

                 Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  void tryToVectorizeGatheredLoads(

      std::tuple<BasicBlock *, Value *, Type *>,

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  void reorderGatherNode(TreeEntry &TE);

    TreeEntry(VecTreeTy &Container) : Container(Container) {}

      if (State == TreeEntry::SplitVectorize)

      SmallVector<int> Mask;

    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");

      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))

            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second

    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());

                    [Scalars](Value *V, int Idx) {
                      return (isa<UndefValue>(V) &&
                              Idx == PoisonMaskElem) ||
                             (Idx != PoisonMaskElem && V == Scalars[Idx]);

      if (!ReorderIndices.empty()) {

        SmallVector<int> Mask;

        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {

          return IsSame(Scalars, Mask);

      return IsSame(Scalars, ReuseShuffleIndices);

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())

      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {

          if (getOperand(K) == TE.getOperand(I)) {

        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

    bool isGather() const { return State == NeedToGather; }

    WeakTrackingVH VectorizedValue = nullptr;

    enum CombinedOpcode {

      MinMax = Instruction::OtherOpsEnd + 1,

    CombinedOpcode CombinedOp = NotCombinedOp;

    SmallVector<int, 4> ReuseShuffleIndices;

    SmallVector<unsigned, 4> ReorderIndices;

    VecTreeTy &Container;

    EdgeInfo UserTreeIndex;

    SmallPtrSet<const Value *, 4> CopyableElements;

    InstructionsState S = InstructionsState::invalid();

    unsigned InterleaveFactor = 0;

    bool DoesNotNeedToSchedule = false;

      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);

             "Number of operands is greater than the number of scalars.");

    unsigned getInterleaveFactor() const { return InterleaveFactor; }

    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }

    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

        setOperand(I, Operands[I]);

    void reorderOperands(ArrayRef<int> Mask) {

      return Operands[OpIdx];

      return Operands[OpIdx];

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {

      return Operands[OpIdx][0];

    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);

      if (I && getMatchingMainOpOrAltOp(I))

      return S.getMainOp();

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }

    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);

    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);

    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    const InstructionsState &getOperations() const { return S; }

    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {

        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())

        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);

      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    bool isNonPowOf2Vec() const {

      return IsNonPowerOf2;

    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {

      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;

      return Scalars[Mask[Idx]];

      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])

      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)

      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";

        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor

          dbgs() << "Vectorize\n";

      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";

      case StridedVectorize:
        dbgs() << "StridedVectorize\n";

      case CompressVectorize:
        dbgs() << "CompressVectorize\n";

        dbgs() << "NeedToGather\n";

      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";

      case SplitVectorize:
        dbgs() << "SplitVectorize\n";

        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";

        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";

      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";

      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())

        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";

      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";

      dbgs() << "UserTreeIndex: ";

        dbgs() << UserTreeIndex;

        dbgs() << "<invalid>";

      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";

          dbgs() << "Entry index " << P.first << " with offset " << P.second;

              StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";

      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";

                          const InstructionsState &S,

                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);

                          const InstructionsState &S,

                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);

                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,

                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");

    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)

    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);

            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {

      Last->setOperations(S);

      Last->Scalars.assign(VL.size(), nullptr);

                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());

      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());

    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {

        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {

                 "Value already associated with the node.");
          It->getSecond().push_back(Last);

    } else if (!Last->isGather()) {

          (!S.areInstructionsWithCopyableElements() &&

          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {

        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);

        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {

                 "Value already associated with the node.");
          It->getSecond().push_back(Last);

      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)

        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");

        Bundle.setTreeEntry(Last);

      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);

          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);

      if (AllConstsOrCasts)

            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;

  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())

    return It->getSecond();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())

    return It->getSecond();

                 bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))

  bool areAltOperandsProfitable(const InstructionsState &S,

  class ScalarsVectorizationLegality {
    InstructionsState S;

    bool TryToFindDuplicates;
    bool TrySplitVectorize;

    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");

    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }

  ScalarsVectorizationLegality

                              bool TryCopyableElementsVectorization) const;

  TreeEntry::EntryState getScalarsVectorizationState(

      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);

  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  SmallDenseMap<Value *, unsigned> InstrElementSize;

  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

  SetVector<const TreeEntry *> PostponedGathers;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  SetVector<unsigned> LoadEntriesToVectorize;

  bool IsGraphTransformMode = false;

  std::optional<unsigned> GatheredLoadsEntriesFirst;

  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;

  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    Value *Scalar = nullptr;

    llvm::User *User = nullptr;

  using UserList = SmallVector<ExternalUser, 16>;

  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {

    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);

      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));

    Res.first->getSecond() = Aliased;

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  SmallDenseMap<AliasCacheKey, bool> AliasCache;

  BatchAAResults BatchAA;

  DenseSet<Instruction *> DeletedInstructions;

  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  DenseSet<size_t> AnalyzedReductionVals;

  DenseSet<Value *> AnalyzedMinBWVals;

  UserList ExternalUses;

  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;

  SmallPtrSet<const Value *, 32> EphValues;

  SetVector<Instruction *> GatherShuffleExtractSeq;

  DenseSet<BasicBlock *> CSEBlocks;

  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;

  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

    int SchedulingPriority = 0;

    bool IsScheduled = false;

    const Kind K = Kind::ScheduleData;

    ScheduleEntity() = delete;

    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    bool isReady() const {

        return SD->isReady();

      return CD->isReady();

    bool hasValidDependencies() const {

        return SD->hasValidDependencies();

      return CD->hasValidDependencies();

    int getUnscheduledDeps() const {

        return SD->getUnscheduledDeps();

      return CD->getUnscheduledDeps();

    int incrementUnscheduledDeps(int Incr) {

        return SD->incrementUnscheduledDeps(Incr);

    int getDependencies() const {

        return SD->getDependencies();

        return SD->getInst();

    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {

        return SD->dump(OS);

      return CD->dump(OS);

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

                                     const BoUpSLP::ScheduleEntity &SE) {

  class ScheduleData final : public ScheduleEntity {

    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;

    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");

        assert(UnscheduledDeps == Dependencies && "invariant");

      assert(hasValidDependencies() && UnscheduledDeps == 0 &&
             "unexpected scheduled state");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return UnscheduledDeps;

    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4814 void clearDependencies() {
4815 clearDirectDependencies();
4816 MemoryDependencies.clear();
4817 ControlDependencies.clear();
4824 void clearDirectDependencies() {
4825 Dependencies = InvalidDeps;
4826 resetUnscheduledDeps();
4827 IsScheduled =
false;
4831 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4833 int getDependencies()
const {
return Dependencies; }
4835 void initDependencies() { Dependencies = 0; }
4837 void incDependencies() { Dependencies++; }
4840 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4847 return MemoryDependencies;
4850 void addMemoryDependency(ScheduleData *Dep) {
4851 MemoryDependencies.push_back(Dep);
4855 return ControlDependencies;
4858 void addControlDependency(ScheduleData *Dep) {
4859 ControlDependencies.push_back(Dep);
4862 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4863 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4865 void dump(raw_ostream &OS)
const { OS << *Inst; }
4877 ScheduleData *NextLoadStore =
nullptr;
4881 SmallVector<ScheduleData *> MemoryDependencies;
4887 SmallVector<ScheduleData *> ControlDependencies;
4891 int SchedulingRegionID = 0;
4897 int Dependencies = InvalidDeps;
4903 int UnscheduledDeps = InvalidDeps;
4908 const BoUpSLP::ScheduleData &SD) {
4914 class ScheduleBundle final :
public ScheduleEntity {
4918 bool IsValid =
true;
4920 TreeEntry *TE =
nullptr;
4921 ScheduleBundle(
bool IsValid)
4922 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4925 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4926 static bool classof(
const ScheduleEntity *Entity) {
4927 return Entity->getKind() == Kind::ScheduleBundle;
4932 for (
const ScheduleEntity *SD : Bundle) {
4933 if (SD->hasValidDependencies()) {
4934 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4937 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4941 if (isScheduled()) {
4942 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4943 "unexpected scheduled state");
4949 int unscheduledDepsInBundle()
const {
4950 assert(*
this &&
"bundle must not be empty");
4952 for (
const ScheduleEntity *BundleMember : Bundle) {
4953 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4954 return ScheduleData::InvalidDeps;
4955 Sum += BundleMember->getUnscheduledDeps();
4963 bool hasValidDependencies()
const {
4964 return all_of(Bundle, [](
const ScheduleEntity *SD) {
4965 return SD->hasValidDependencies();
4971 bool isReady()
const {
4972 assert(*
this &&
"bundle must not be empty");
4973 return unscheduledDepsInBundle() == 0 && !isScheduled();
4981 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4984 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4985 TreeEntry *getTreeEntry()
const {
return TE; }
4987 static ScheduleBundle invalid() {
return {
false}; }
4989 operator bool()
const {
return IsValid; }
4992 void dump(raw_ostream &OS)
const {
5001 OS << *SD->getInst();
5015 const BoUpSLP::ScheduleBundle &Bundle) {
5026 class ScheduleCopyableData final :
public ScheduleEntity {
5033 int SchedulingRegionID = 0;
5035 ScheduleBundle &Bundle;
5038 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5039 const EdgeInfo &EI, ScheduleBundle &Bundle)
5040 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5041 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5042 static bool classof(
const ScheduleEntity *Entity) {
5043 return Entity->getKind() == Kind::ScheduleCopyableData;
5048 if (hasValidDependencies()) {
5049 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5051 assert(UnscheduledDeps == Dependencies &&
"invariant");
5055 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5056 "unexpected scheduled state");
5063 bool hasValidDependencies()
const {
5064 return Dependencies != ScheduleData::InvalidDeps;
5069 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5074 int incrementUnscheduledDeps(
int Incr) {
5075 assert(hasValidDependencies() &&
5076 "increment of unscheduled deps would be meaningless");
5077 UnscheduledDeps += Incr;
5078 assert(UnscheduledDeps >= 0 &&
"invariant");
5079 return UnscheduledDeps;
5084 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5087 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5089 int getDependencies()
const {
return Dependencies; }
5091 void initDependencies() { Dependencies = 0; }
5093 void incDependencies() { Dependencies++; }
5096 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5102 void clearDependencies() {
5103 Dependencies = ScheduleData::InvalidDeps;
5104 UnscheduledDeps = ScheduleData::InvalidDeps;
5105 IsScheduled =
false;
5109 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5112 ScheduleBundle &getBundle() {
return Bundle; }
5113 const ScheduleBundle &getBundle()
const {
return Bundle; }
5115#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5116 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5127 int Dependencies = ScheduleData::InvalidDeps;
5133 int UnscheduledDeps = ScheduleData::InvalidDeps;
5163 struct BlockScheduling {
5165 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5168 ScheduledBundles.clear();
5169 ScheduledBundlesList.
clear();
5170 ScheduleCopyableDataMap.clear();
5171 ScheduleCopyableDataMapByInst.clear();
5172 ScheduleCopyableDataMapByInstUser.clear();
5173 ScheduleCopyableDataMapByUsers.clear();
5175 ScheduleStart =
nullptr;
5176 ScheduleEnd =
nullptr;
5177 FirstLoadStoreInRegion =
nullptr;
5178 LastLoadStoreInRegion =
nullptr;
5179 RegionHasStackSave =
false;
5183 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5186 ScheduleRegionSize = 0;
5190 ++SchedulingRegionID;
5196 if (BB !=
I->getParent())
5199 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5200 if (SD && isInSchedulingRegion(*SD))
5205 ScheduleData *getScheduleData(
Value *V) {
5211 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5212 const Value *V)
const {
5213 if (ScheduleCopyableDataMap.empty())
5215 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5216 if (It == ScheduleCopyableDataMap.end())
5218 ScheduleCopyableData *SD = It->getSecond().get();
5219 if (!isInSchedulingRegion(*SD))
5227 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5229 if (ScheduleCopyableDataMapByInstUser.empty())
5231 const auto It = ScheduleCopyableDataMapByInstUser.find(
5232 std::make_pair(std::make_pair(User, OperandIdx), V));
5233 if (It == ScheduleCopyableDataMapByInstUser.end())
5236 for (ScheduleCopyableData *SD : It->getSecond()) {
5237 if (isInSchedulingRegion(*SD))
5251 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5255 if (ScheduleCopyableDataMap.empty())
5257 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5258 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5259 for (
const Use &U :
User->operands()) {
5263 if (Entries.
empty())
5267 for (TreeEntry *TE : Entries) {
5273 bool IsCommutativeUser =
5276 EdgeInfo EI(TE,
U.getOperandNo());
5279 OrderedEntriesCount.
try_emplace(TE, 0).first->getSecond();
5280 if (!getScheduleCopyableData(EI,
Op) && OpCnt <
NumOps)
5286 ++PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5287 .first->getSecond();
5291 if (!PotentiallyReorderedEntriesCount.
empty()) {
5292 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5293 auto *It =
find(
P.first->Scalars, User);
5294 assert(It !=
P.first->Scalars.end() &&
5295 "User is not in the tree entry");
5296 int Lane = std::distance(
P.first->Scalars.begin(), It);
5297 assert(Lane >= 0 &&
"Lane is not found");
5299 Lane =
P.first->ReorderIndices[Lane];
5300 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5301 "Couldn't find extract lane");
5302 SmallVector<unsigned> OpIndices;
5303 for (
unsigned OpIdx :
5305 P.first->getMainOp()))) {
5306 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5307 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5311 return all_of(PotentiallyReorderedEntriesCount,
5312 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5313 return P.second ==
NumOps - 1;
5320 getScheduleCopyableData(
const Instruction *
I)
const {
5321 if (ScheduleCopyableDataMapByInst.empty())
5323 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5324 if (It == ScheduleCopyableDataMapByInst.end())
5327 for (ScheduleCopyableData *SD : It->getSecond()) {
5328 if (isInSchedulingRegion(*SD))
5335 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5336 if (ScheduleCopyableDataMapByUsers.empty())
5338 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5339 if (It == ScheduleCopyableDataMapByUsers.end())
5342 for (ScheduleCopyableData *SD : It->getSecond()) {
5343 if (isInSchedulingRegion(*SD))
5349 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5351 int SchedulingRegionID,
5352 ScheduleBundle &Bundle) {
5353 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5354 ScheduleCopyableData *CD =
5355 ScheduleCopyableDataMap
5356 .try_emplace(std::make_pair(EI,
I),
5357 std::make_unique<ScheduleCopyableData>(
5358 SchedulingRegionID,
I, EI, Bundle))
5361 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5365 assert(It !=
Op.end() &&
"Lane not set");
5366 SmallPtrSet<Instruction *, 4> Visited;
5368 int Lane = std::distance(
Op.begin(), It);
5369 assert(Lane >= 0 &&
"Lane not set");
5371 !EI.UserTE->ReorderIndices.empty())
5372 Lane = EI.UserTE->ReorderIndices[Lane];
5373 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5374 "Couldn't find extract lane");
5376 if (!Visited.
insert(In).second) {
5380 ScheduleCopyableDataMapByInstUser
5381 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5384 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5391 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5392 if (ScheduleCopyableData *UserCD =
5393 getScheduleCopyableData(UserEI, In))
5394 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5397 }
while (It !=
Op.end());
5399 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5409 auto It = ScheduledBundles.find(
I);
5410 if (It == ScheduledBundles.end())
5412 return It->getSecond();
5416 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5418 return Data->getSchedulingRegionID() == SchedulingRegionID;
5420 return CD->getSchedulingRegionID() == SchedulingRegionID;
5422 [&](
const ScheduleEntity *BundleMember) {
5423 return isInSchedulingRegion(*BundleMember);
5429 template <
typename ReadyListType>
5430 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5431 const EdgeInfo &EI, ScheduleEntity *
Data,
5432 ReadyListType &ReadyList) {
5433 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5438 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5439 if ((IsControl ||
Data->hasValidDependencies()) &&
5440 Data->incrementUnscheduledDeps(-1) == 0) {
5447 CopyableBundle.
push_back(&CD->getBundle());
5448 Bundles = CopyableBundle;
5450 Bundles = getScheduleBundles(
Data->getInst());
5452 if (!Bundles.
empty()) {
5453 for (ScheduleBundle *Bundle : Bundles) {
5454 if (Bundle->unscheduledDepsInBundle() == 0) {
5455 assert(!Bundle->isScheduled() &&
5456 "already scheduled bundle gets ready");
5457 ReadyList.insert(Bundle);
5459 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5465 "already scheduled bundle gets ready");
5467 "Expected non-copyable data");
5468 ReadyList.insert(
Data);
5475 if (!ScheduleCopyableDataMap.empty()) {
5477 getScheduleCopyableData(User,
OpIdx,
I);
5478 for (ScheduleCopyableData *CD : CopyableData)
5479 DecrUnsched(CD,
false);
5480 if (!CopyableData.empty())
5483 if (ScheduleData *OpSD = getScheduleData(
I))
5484 DecrUnsched(OpSD,
false);
5490 if (!Bundles.empty()) {
5491 auto *
In = BundleMember->getInst();
5493 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5494 unsigned TotalOpCount = 0;
5497 TotalOpCount = OperandsUses[
In] = 1;
5499 for (
const Use &U :
In->operands()) {
5502 ++Res.first->getSecond();
5509 auto DecrUnschedForInst = [&](
Instruction *
I, TreeEntry *UserTE,
5511 if (!ScheduleCopyableDataMap.empty()) {
5512 const EdgeInfo EI = {UserTE,
OpIdx};
5513 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI,
I)) {
5514 DecrUnsched(CD,
false);
5518 auto It = OperandsUses.
find(
I);
5519 assert(It != OperandsUses.
end() &&
"Operand not found");
5520 if (It->second > 0) {
5522 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5524 if (ScheduleData *OpSD = getScheduleData(
I))
5525 DecrUnsched(OpSD,
false);
5529 for (ScheduleBundle *Bundle : Bundles) {
5530 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5534 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5535 find(Bundle->getTreeEntry()->Scalars, In));
5536 assert(Lane >= 0 &&
"Lane not set");
5538 !Bundle->getTreeEntry()->ReorderIndices.empty())
5539 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5540 assert(Lane <
static_cast<int>(
5541 Bundle->getTreeEntry()->Scalars.size()) &&
5542 "Couldn't find extract lane");
5552 In->getNumOperands() ==
5553 Bundle->getTreeEntry()->getNumOperands() ||
5554 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5555 "Missed TreeEntry operands?");
5557 for (
unsigned OpIdx :
5560 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5563 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx);
5569 for (Use &U : BundleMember->getInst()->operands()) {
5572 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5573 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5581 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5582 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5583 if (!VisitedMemory.
insert(MemoryDep).second)
5588 << *MemoryDep <<
"\n");
5589 DecrUnsched(MemoryDep);
5592 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5593 for (ScheduleData *Dep : SD->getControlDependencies()) {
5594 if (!VisitedControl.
insert(Dep).second)
5599 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5600 DecrUnsched(Dep,
true);
5604 SD->setScheduled(
true);
5609 if (
R.isVectorized(In)) {
5611 for (TreeEntry *TE : Entries) {
5613 In->getNumOperands() !=
TE->getNumOperands())
5616 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5617 BundlePtr->setTreeEntry(TE);
5622 ProcessBundleMember(SD, Bundles);
5625 Bundle.setScheduled(
true);
5627 auto AreAllBundlesScheduled =
5628 [&](
const ScheduleEntity *SD,
5632 return !SDBundles.empty() &&
5633 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5634 return SDBundle->isScheduled();
5637 for (ScheduleEntity *SD : Bundle.getBundle()) {
5640 SDBundles = getScheduleBundles(SD->getInst());
5641 if (AreAllBundlesScheduled(SD, SDBundles)) {
5642 SD->setScheduled(
true);
5655 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5656 ScheduleStart->comesBefore(ScheduleEnd) &&
5657 "Not a valid scheduling region?");
5659 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5661 if (!Bundles.
empty()) {
5662 for (ScheduleBundle *Bundle : Bundles) {
5663 assert(isInSchedulingRegion(*Bundle) &&
5664 "primary schedule data not in window?");
5669 auto *SD = getScheduleData(
I);
5672 assert(isInSchedulingRegion(*SD) &&
5673 "primary schedule data not in window?");
5678 [](
const ScheduleEntity *Bundle) {
5679 return Bundle->isReady();
5681 "item in ready list not ready?");
5685 template <
typename ReadyListType>
5686 void initialFillReadyList(ReadyListType &ReadyList) {
5687 SmallPtrSet<ScheduleBundle *, 16> Visited;
5688 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5689 ScheduleData *SD = getScheduleData(
I);
5690 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5693 for (ScheduleBundle *Bundle : Bundles) {
5694 if (!Visited.
insert(Bundle).second)
5696 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5697 ReadyList.insert(Bundle);
5699 << *Bundle <<
"\n");
5704 ReadyList.insert(SD);
5706 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5717 const InstructionsState &S,
const EdgeInfo &EI);
5724 std::optional<ScheduleBundle *>
5726 const InstructionsState &S,
const EdgeInfo &EI);
5729 ScheduleData *allocateScheduleDataChunks();
5733 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5737 void initScheduleData(Instruction *FromI, Instruction *ToI,
5738 ScheduleData *PrevLoadStore,
5739 ScheduleData *NextLoadStore);
5743 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5748 void resetSchedule();
5765 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5769 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5770 std::unique_ptr<ScheduleCopyableData>>
5771 ScheduleCopyableDataMap;
5777 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5778 ScheduleCopyableDataMapByInst;
5784 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5786 ScheduleCopyableDataMapByInstUser;
5806 SmallSetVector<ScheduleCopyableData *, 4>>
5807 ScheduleCopyableDataMapByUsers;
5810 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5816 SetVector<ScheduleEntity *> ReadyInsts;
5826 ScheduleData *FirstLoadStoreInRegion =
nullptr;
5830 ScheduleData *LastLoadStoreInRegion =
nullptr;
5835 bool RegionHasStackSave =
false;
5838 int ScheduleRegionSize = 0;
5847 int SchedulingRegionID = 1;
5851 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5855 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
5858 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
5862 struct OrdersTypeDenseMapInfo {
5875 static unsigned getHashValue(
const OrdersType &V) {
5886 ScalarEvolution *SE;
5887 TargetTransformInfo *TTI;
5888 TargetLibraryInfo *TLI;
5891 AssumptionCache *AC;
5893 const DataLayout *DL;
5894 OptimizationRemarkEmitter *ORE;
5896 unsigned MaxVecRegSize;
5897 unsigned MinVecRegSize;
5900 IRBuilder<TargetFolder> Builder;
5907 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5912 unsigned ReductionBitWidth = 0;
5915 unsigned BaseGraphSize = 1;
5919 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5923 DenseSet<unsigned> ExtraBitWidthNodes;
5933 SecondInfo::getEmptyKey());
5938 SecondInfo::getTombstoneKey());
5943 SecondInfo::getHashValue(Val.
EdgeIdx));
5964 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5975 return R.VectorizableTree[0].get();
5979 return {&
N->UserTreeIndex,
N->Container};
5983 return {&
N->UserTreeIndex + 1,
N->Container};
6010 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6021 OS << Entry->Idx <<
".\n";
6024 for (
auto *V : Entry->Scalars) {
6026 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6027 return EU.Scalar == V;
6037 if (Entry->isGather())
6039 if (Entry->State == TreeEntry::ScatterVectorize ||
6040 Entry->State == TreeEntry::StridedVectorize ||
6041 Entry->State == TreeEntry::CompressVectorize)
6042 return "color=blue";
6051 for (
auto *
I : DeletedInstructions) {
6052 if (!
I->getParent()) {
6057 I->insertBefore(F->getEntryBlock(),
6058 F->getEntryBlock().getFirstNonPHIIt());
6060 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6063 for (
Use &U :
I->operands()) {
6065 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6069 I->dropAllReferences();
6071 for (
auto *
I : DeletedInstructions) {
6073 "trying to erase instruction with users.");
6074 I->eraseFromParent();
6080#ifdef EXPENSIVE_CHECKS
6091 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6092 "Expected non-empty mask.");
6095 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6097 Reuses[Mask[
I]] = Prev[
I];
6105 bool BottomOrder =
false) {
6106 assert(!Mask.empty() &&
"Expected non-empty mask.");
6107 unsigned Sz = Mask.size();
6110 if (Order.
empty()) {
6112 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6114 PrevOrder.
swap(Order);
6117 for (
unsigned I = 0;
I < Sz; ++
I)
6119 Order[
I] = PrevOrder[Mask[
I]];
6121 return Data.value() == Sz ||
Data.index() ==
Data.value();
6130 if (Order.
empty()) {
6132 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6142 for (
unsigned I = 0;
I < Sz; ++
I)
6144 Order[MaskOrder[
I]] =
I;
6148std::optional<BoUpSLP::OrdersType>
6150 bool TopToBottom,
bool IgnoreReorder) {
6151 assert(TE.isGather() &&
"Expected gather node only.");
6155 Type *ScalarTy = GatheredScalars.
front()->getType();
6156 size_t NumScalars = GatheredScalars.
size();
6158 return std::nullopt;
6165 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6167 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6170 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6171 return std::nullopt;
6172 OrdersType CurrentOrder(NumScalars, NumScalars);
6173 if (GatherShuffles.
size() == 1 &&
6175 Entries.
front().front()->isSame(TE.Scalars)) {
6179 return std::nullopt;
6181 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6182 TE.UserTreeIndex.UserTE)
6183 return std::nullopt;
6186 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6187 return std::nullopt;
6190 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6191 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6194 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6196 return std::nullopt;
6200 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6201 return CurrentOrder;
6205 return all_of(Mask, [&](
int I) {
6212 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6213 (Entries.
size() != 1 ||
6214 Entries.
front().front()->ReorderIndices.empty())) ||
6215 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6216 return std::nullopt;
6222 if (ShuffledSubMasks.
test(
I))
6224 const int VF = GetVF(
I);
6230 if (
any_of(Slice, [&](
unsigned I) {
return I != NumScalars; })) {
6232 ShuffledSubMasks.
set(
I);
6236 int FirstMin = INT_MAX;
6237 int SecondVecFound =
false;
6239 int Idx = Mask[
I * PartSz + K];
6241 Value *V = GatheredScalars[
I * PartSz + K];
6243 SecondVecFound =
true;
6252 SecondVecFound =
true;
6256 FirstMin = (FirstMin / PartSz) * PartSz;
6258 if (SecondVecFound) {
6260 ShuffledSubMasks.
set(
I);
6264 int Idx = Mask[
I * PartSz + K];
6268 if (Idx >= PartSz) {
6269 SecondVecFound =
true;
6272 if (CurrentOrder[
I * PartSz + Idx] >
6273 static_cast<unsigned>(
I * PartSz + K) &&
6274 CurrentOrder[
I * PartSz + Idx] !=
6275 static_cast<unsigned>(
I * PartSz + Idx))
6276 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6279 if (SecondVecFound) {
6281 ShuffledSubMasks.
set(
I);
6287 if (!ExtractShuffles.
empty())
6288 TransformMaskToOrder(
6289 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6290 if (!ExtractShuffles[
I])
6293 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6295 int K =
I * PartSz + Idx;
6298 if (!TE.ReuseShuffleIndices.empty())
6299 K = TE.ReuseShuffleIndices[K];
6302 if (!TE.ReorderIndices.empty())
6303 K = std::distance(TE.ReorderIndices.begin(),
6304 find(TE.ReorderIndices, K));
6310 .getKnownMinValue());
6315 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6316 if (ShuffledSubMasks.
any())
6317 return std::nullopt;
6318 PartSz = NumScalars;
6321 if (!Entries.
empty())
6322 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6323 if (!GatherShuffles[
I])
6325 return std::max(Entries[
I].front()->getVectorFactor(),
6326 Entries[
I].back()->getVectorFactor());
6328 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6329 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6330 return std::nullopt;
6331 return std::move(CurrentOrder);
6336 bool CompareOpcodes =
true) {
6342 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6343 (!GEP2 || GEP2->getNumOperands() == 2) &&
6344 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6345 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6348 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6352template <
typename T>
6357 return CommonAlignment;
6363 "Order is empty. Please check it before using isReverseOrder.");
6364 unsigned Sz = Order.
size();
6366 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6377 const SCEV *PtrSCEVLowest =
nullptr;
6378 const SCEV *PtrSCEVHighest =
nullptr;
6386 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6387 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6394 PtrSCEVLowest = PtrSCEV;
6401 PtrSCEVHighest = PtrSCEV;
6409 int Size =
DL.getTypeStoreSize(ElemTy);
6410 auto TryGetStride = [&](
const SCEV *Dist,
6411 const SCEV *Multiplier) ->
const SCEV * {
6413 if (M->getOperand(0) == Multiplier)
6414 return M->getOperand(1);
6415 if (M->getOperand(1) == Multiplier)
6416 return M->getOperand(0);
6419 if (Multiplier == Dist)
6424 const SCEV *Stride =
nullptr;
6425 if (
Size != 1 || SCEVs.
size() > 2) {
6427 Stride = TryGetStride(Dist, Sz);
6435 using DistOrdPair = std::pair<int64_t, int>;
6437 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6439 bool IsConsecutive =
true;
6440 for (
const SCEV *PtrSCEV : SCEVs) {
6442 if (PtrSCEV != PtrSCEVLowest) {
6444 const SCEV *Coeff = TryGetStride(Diff, Stride);
6454 Dist = SC->getAPInt().getZExtValue();
6459 auto Res = Offsets.emplace(Dist, Cnt);
6463 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6466 if (Offsets.size() != SCEVs.
size())
6468 SortedIndices.
clear();
6469 if (!IsConsecutive) {
6473 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6474 SortedIndices[Cnt] = Pair.second;
6481static std::pair<InstructionCost, InstructionCost>
6484 Type *ScalarTy, VectorType *VecTy);
6502 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6505 Mask, NumSrcElts, NumSubElts, Index)) {
6506 if (Index + NumSubElts > NumSrcElts &&
6507 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6524 "ScalableVectorType is not supported.");
6527 "Incorrect usage.");
6532 unsigned ScalarTyNumElements = VecTy->getNumElements();
6535 if (!DemandedElts[
I])
6539 I * ScalarTyNumElements, VecTy);
6542 I * ScalarTyNumElements, VecTy);
6555 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6556 if (Opcode == Instruction::ExtractElement) {
6562 Index * VecTy->getNumElements(), VecTy);
6565 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6578 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6580 Index * ScalarTy->getNumElements(), SubTp) +
6584 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6600 auto *Begin = std::next(
Mask.begin(), Index);
6601 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6605 std::iota(
Mask.begin(),
Mask.end(), 0);
6606 std::iota(std::next(
Mask.begin(), Index),
6607 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6609 return Generator(Vec, V, Mask);
6612 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6620 unsigned SubVecVF,
unsigned Index) {
6622 std::iota(Mask.begin(), Mask.end(), Index);
6623 return Builder.CreateShuffleVector(Vec, Mask);
6633 const unsigned Sz = PointerOps.
size();
6636 CompressMask[0] = 0;
6638 std::optional<unsigned> Stride = 0;
6642 std::optional<int64_t> OptPos =
6644 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6646 unsigned Pos =
static_cast<unsigned>(*OptPos);
6647 CompressMask[
I] = Pos;
6654 if (Pos != *Stride *
I)
6657 return Stride.has_value();
6670 InterleaveFactor = 0;
6672 const size_t Sz = VL.
size();
6680 if (AreAllUsersVectorized(V))
6683 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6684 Mask.empty() ?
I : Mask[
I]);
6687 if (ExtractCost <= ScalarCost)
6692 if (Order.
empty()) {
6693 Ptr0 = PointerOps.
front();
6694 PtrN = PointerOps.
back();
6696 Ptr0 = PointerOps[Order.
front()];
6697 PtrN = PointerOps[Order.
back()];
6699 std::optional<int64_t> Diff =
6703 const size_t MaxRegSize =
6707 if (*Diff / Sz >= MaxRegSize / 8)
6711 Align CommonAlignment = LI->getAlign();
6713 Ptr0, LoadVecTy, CommonAlignment,
DL,
6716 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6717 LI->getPointerAddressSpace()))
6723 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6727 auto [ScalarGEPCost, VectorGEPCost] =
6729 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6747 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6748 LI->getPointerAddressSpace(),
CostKind);
6751 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6752 LI->getPointerAddressSpace(),
CostKind);
6754 if (IsStrided && !IsMasked && Order.
empty()) {
6761 AlignedLoadVecTy = LoadVecTy;
6762 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6764 LI->getPointerAddressSpace())) {
6766 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6767 Instruction::Load, AlignedLoadVecTy,
6768 CompressMask[1], {}, CommonAlignment,
6769 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6770 if (InterleavedCost < GatherCost) {
6771 InterleaveFactor = CompressMask[1];
6772 LoadVecTy = AlignedLoadVecTy;
6779 if (!Order.
empty()) {
6782 NewMask[
I] = CompressMask[Mask[
I]];
6784 CompressMask.
swap(NewMask);
6786 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6787 return TotalVecCost < GatherCost;
6800 unsigned InterleaveFactor;
6804 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6805 CompressMask, LoadVecTy);
6825 const bool IsAnyPointerUsedOutGraph,
6827 StridedPtrInfo &SPtrInfo)
const {
6828 const size_t Sz = VL.
size();
6829 const uint64_t AbsoluteDiff = std::abs(Diff);
6832 if (IsAnyPointerUsedOutGraph ||
6833 (AbsoluteDiff > Sz &&
6836 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
6837 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
6838 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6839 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
6844 if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
6848 if (Order.
empty()) {
6849 Ptr0 = PointerOps.
front();
6850 PtrN = PointerOps.
back();
6852 Ptr0 = PointerOps[Order.
front()];
6853 PtrN = PointerOps[Order.
back()];
6862 else if (
Ptr != Ptr0)
6866 if (((Dist / Stride) * Stride) != Dist || !Dists.
insert(Dist).second)
6869 if (Dists.
size() == Sz) {
6870 Type *StrideTy = DL.getIndexType(Ptr0->
getType());
6871 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6882 unsigned *BestVF,
bool TryRecursiveCheck)
const {
6895 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6901 const size_t Sz = VL.
size();
6903 auto *POIter = PointerOps.
begin();
6904 for (
Value *V : VL) {
6906 if (!L || !L->isSimple())
6908 *POIter = L->getPointerOperand();
6914 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6920 if (
const SCEV *Stride =
6922 Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
6924 SPtrInfo.StrideSCEV = Stride;
6929 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6930 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6941 if (Order.
empty()) {
6942 Ptr0 = PointerOps.
front();
6943 PtrN = PointerOps.
back();
6945 Ptr0 = PointerOps[Order.
front()];
6946 PtrN = PointerOps[Order.
back()];
6948 std::optional<int64_t> Diff =
6951 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
6954 *TLI, [&](
Value *V) {
6955 return areAllUsersVectorized(
6960 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
6962 auto IsAnyPointerUsedOutGraph =
6963 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
6965 return !isVectorized(U) && !MustGather.contains(U);
6968 if (IsPossibleStrided &&
6970 IsAnyPointerUsedOutGraph, *Diff, SPtrInfo))
6973 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6974 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6979 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
6981 bool ProfitableGatherPointers) {
6986 auto [ScalarGEPCost, VectorGEPCost] =
6988 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
6992 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
6994 if (
static_cast<unsigned>(
count_if(
7013 return C + TTI.getInstructionCost(
7019 TTI.getGatherScatterOpCost(
7021 false, CommonAlignment,
CostKind) +
7022 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7030 constexpr unsigned ListLimit = 4;
7031 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7040 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7050 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7055 PointerOps, SPtrInfo, BestVF,
7063 DemandedElts.
setBits(Cnt, Cnt + VF);
7079 if (!DemandedElts.
isZero()) {
7085 if (DemandedElts[Idx])
7096 LI0->getPointerOperand(),
7097 Instruction::GetElementPtr,
CostKind, ScalarTy,
7101 if (
static_cast<unsigned>(
7103 PointerOps.
size() - 1 ||
7122 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7123 LI0->getPointerAddressSpace(),
CostKind,
7128 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7129 LI0->getPointerOperand(),
7135 VecLdCost += TTI.getMaskedMemoryOpCost(
7136 Instruction::Load, SubVecTy, CommonAlignment,
7137 LI0->getPointerAddressSpace(),
CostKind) +
7143 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7144 LI0->getPointerOperand(),
7155 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7164 if (MaskedGatherCost >= VecLdCost &&
7177 bool ProfitableGatherPointers =
7178 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7179 return L->isLoopInvariant(V);
7181 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7184 (
GEP &&
GEP->getNumOperands() == 2 &&
7192 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7193 ProfitableGatherPointers))
7205 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7206 "Expected list of pointer operands.");
7211 std::pair<BasicBlock *, Value *>,
7215 .try_emplace(std::make_pair(
7219 SortedIndices.
clear();
7221 auto Key = std::make_pair(BBs[Cnt + 1],
7223 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7225 std::optional<int64_t> Diff =
7226 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7227 ElemTy, Ptr, DL, SE,
7232 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7238 if (Bases.size() > VL.
size() / 2 - 1)
7242 Bases.find(
Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7246 if (Bases.size() == VL.
size())
7249 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7250 Bases.front().second.size() == VL.
size()))
7255 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7264 FirstPointers.
insert(P1);
7265 SecondPointers.
insert(P2);
7271 "Unable to find matching root.");
7274 for (
auto &
Base : Bases) {
7275 for (
auto &Vec :
Base.second) {
7276 if (Vec.size() > 1) {
7278 int64_t InitialOffset = std::get<1>(Vec[0]);
7279 bool AnyConsecutive =
7281 return std::get<1>(
P.value()) ==
7282 int64_t(
P.index()) + InitialOffset;
7286 if (!AnyConsecutive)
7291 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7295 for (
auto &
T : Bases)
7296 for (
const auto &Vec :
T.second)
7297 for (
const auto &
P : Vec)
7301 "Expected SortedIndices to be the size of VL");
7305std::optional<BoUpSLP::OrdersType>
7307 assert(TE.isGather() &&
"Expected gather node only.");
7308 Type *ScalarTy = TE.Scalars[0]->getType();
7311 Ptrs.
reserve(TE.Scalars.size());
7313 BBs.
reserve(TE.Scalars.size());
7314 for (
Value *V : TE.Scalars) {
7316 if (!L || !L->isSimple())
7317 return std::nullopt;
7323 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7325 return std::move(Order);
7326 return std::nullopt;
7337 if (VU->
getType() != V->getType())
7340 if (!VU->
hasOneUse() && !V->hasOneUse())
7346 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7353 bool IsReusedIdx =
false;
7355 if (IE2 == VU && !IE1)
7357 if (IE1 == V && !IE2)
7358 return V->hasOneUse();
7359 if (IE1 && IE1 != V) {
7361 IsReusedIdx |= ReusedIdx.
test(Idx1);
7362 ReusedIdx.
set(Idx1);
7363 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7368 if (IE2 && IE2 != VU) {
7370 IsReusedIdx |= ReusedIdx.
test(Idx2);
7371 ReusedIdx.
set(Idx2);
7372 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7377 }
while (!IsReusedIdx && (IE1 || IE2));
7385 const TargetLibraryInfo &TLI);
7387std::optional<BoUpSLP::OrdersType>
7389 bool IgnoreReorder) {
7392 if (!TE.ReuseShuffleIndices.empty()) {
7394 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7395 "Reshuffling scalars not yet supported for nodes with padding");
7398 return std::nullopt;
7406 unsigned Sz = TE.Scalars.size();
7407 if (TE.isGather()) {
7408 if (std::optional<OrdersType> CurrentOrder =
7413 ::addMask(Mask, TE.ReuseShuffleIndices);
7414 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7415 unsigned Sz = TE.Scalars.size();
7416 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7419 Res[Idx + K * Sz] =
I + K * Sz;
7421 return std::move(Res);
7424 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7426 2 * TE.getVectorFactor())) == 1)
7427 return std::nullopt;
7428 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7429 return std::nullopt;
7433 if (TE.ReorderIndices.empty())
7434 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7437 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7438 unsigned VF = ReorderMask.
size();
7442 for (
unsigned I = 0;
I < VF;
I += Sz) {
7444 unsigned UndefCnt = 0;
7445 unsigned Limit = std::min(Sz, VF -
I);
7454 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7456 return std::nullopt;
7458 for (
unsigned K = 0; K < NumParts; ++K) {
7459 unsigned Idx = Val + Sz * K;
7460 if (Idx < VF &&
I + K < VF)
7461 ResOrder[Idx] =
I + K;
7464 return std::move(ResOrder);
7466 unsigned VF = TE.getVectorFactor();
7469 TE.ReuseShuffleIndices.end());
7470 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7472 if (isa<PoisonValue>(V))
7474 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7475 return Idx && *Idx < Sz;
7477 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7478 "by BinaryOperator and CastInst.");
7480 if (TE.ReorderIndices.empty())
7481 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7484 for (
unsigned I = 0;
I < VF; ++
I) {
7485 int &Idx = ReusedMask[
I];
7488 Value *V = TE.Scalars[ReorderMask[Idx]];
7490 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7496 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7497 auto *It = ResOrder.
begin();
7498 for (
unsigned K = 0; K < VF; K += Sz) {
7502 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7504 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7505 std::advance(It, Sz);
7508 return Data.index() ==
Data.value();
7510 return std::nullopt;
7511 return std::move(ResOrder);
7513 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7514 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7516 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
7517 return std::nullopt;
7518 if (TE.State == TreeEntry::SplitVectorize ||
7519 ((TE.State == TreeEntry::Vectorize ||
7520 TE.State == TreeEntry::StridedVectorize ||
7521 TE.State == TreeEntry::CompressVectorize) &&
7524 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7525 "Alternate instructions are only supported by "
7526 "BinaryOperator and CastInst.");
7527 return TE.ReorderIndices;
7529 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7530 TE.isAltShuffle()) {
7531 assert(TE.ReuseShuffleIndices.empty() &&
7532 "ReuseShuffleIndices should be "
7533 "empty for alternate instructions.");
7535 TE.buildAltOpShuffleMask(
7537 assert(TE.getMatchingMainOpOrAltOp(
I) &&
7538 "Unexpected main/alternate opcode");
7542 const int VF = TE.getVectorFactor();
7547 ResOrder[Mask[
I] % VF] =
I;
7549 return std::move(ResOrder);
7551 if (!TE.ReorderIndices.empty())
7552 return TE.ReorderIndices;
7553 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7554 if (!TE.ReorderIndices.empty())
7555 return TE.ReorderIndices;
7558 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
7566 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
7574 assert(BB1 != BB2 &&
"Expected different basic blocks.");
7575 if (!DT->isReachableFromEntry(BB1))
7577 if (!DT->isReachableFromEntry(BB2))
7579 auto *NodeA = DT->getNode(BB1);
7580 auto *NodeB = DT->getNode(BB2);
7581 assert(NodeA &&
"Should only process reachable instructions");
7582 assert(NodeB &&
"Should only process reachable instructions");
7583 assert((NodeA == NodeB) ==
7584 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7585 "Different nodes should have different DFS numbers");
7586 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7588 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
7589 Value *V1 = TE.Scalars[I1];
7590 Value *V2 = TE.Scalars[I2];
7603 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7604 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7605 FirstUserOfPhi2->getParent());
7615 if (UserBVHead[I1] && !UserBVHead[I2])
7617 if (!UserBVHead[I1])
7619 if (UserBVHead[I1] == UserBVHead[I2])
7622 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
7624 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7637 if (EE1->getOperand(0) == EE2->getOperand(0))
7639 if (!Inst1 && Inst2)
7641 if (Inst1 && Inst2) {
7649 "Expected either instructions or arguments vector operands.");
7650 return P1->getArgNo() < P2->getArgNo();
7655 std::iota(Phis.
begin(), Phis.
end(), 0);
7658 return std::nullopt;
7659 return std::move(Phis);
7661 if (TE.isGather() &&
7662 (!TE.hasState() || !TE.isAltShuffle() ||
7663 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7667 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7671 auto *EE = dyn_cast<ExtractElementInst>(V);
7672 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7678 canReuseExtract(TE.Scalars, CurrentOrder,
true);
7679 if (Reuse || !CurrentOrder.
empty())
7680 return std::move(CurrentOrder);
7688 int Sz = TE.Scalars.size();
7692 if (It == TE.Scalars.begin())
7695 if (It != TE.Scalars.end()) {
7697 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7712 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7715 return std::move(Order);
7720 return std::nullopt;
7721 if (TE.Scalars.size() >= 3)
7726 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7728 StridedPtrInfo SPtrInfo;
7731 CurrentOrder, PointerOps, SPtrInfo);
7734 return std::move(CurrentOrder);
7739 if (std::optional<OrdersType> CurrentOrder =
7741 return CurrentOrder;
7743 return std::nullopt;
7753 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
7755 if (Cluster != FirstCluster)
7761void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask)
const {
7764 const unsigned Sz =
TE.Scalars.size();
7766 if (!
TE.isGather() ||
7771 SmallVector<int> NewMask;
7773 addMask(NewMask,
TE.ReuseShuffleIndices);
7775 TE.ReorderIndices.clear();
7777 ArrayRef<int> Slice =
ArrayRef(NewMask).slice(0, Sz);
7778 SmallVector<unsigned> NewOrder(Slice);
7782 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
7783 *End =
TE.ReuseShuffleIndices.end();
7784 It != End; std::advance(It, Sz))
7785 std::iota(It, std::next(It, Sz), 0);
7791 "Expected same size of orders");
7792 size_t Sz = Order.
size();
7795 if (Order[Idx] != Sz)
7796 UsedIndices.
set(Order[Idx]);
7798 if (SecondaryOrder.
empty()) {
7800 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7804 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7805 !UsedIndices.
test(SecondaryOrder[Idx]))
7806 Order[Idx] = SecondaryOrder[Idx];
7814 constexpr unsigned TinyVF = 2;
7815 constexpr unsigned TinyTree = 10;
7816 constexpr unsigned PhiOpsLimit = 12;
7817 constexpr unsigned GatherLoadsLimit = 2;
7818 if (VectorizableTree.size() <= TinyTree)
7820 if (VectorizableTree.front()->hasState() &&
7821 !VectorizableTree.front()->isGather() &&
7822 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7823 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7824 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7825 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7826 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7827 VectorizableTree.front()->ReorderIndices.empty()) {
7831 if (VectorizableTree.front()->hasState() &&
7832 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7833 VectorizableTree.front()->Scalars.size() == TinyVF &&
7834 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7837 if (VectorizableTree.front()->hasState() &&
7838 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7839 VectorizableTree.front()->ReorderIndices.empty()) {
7840 const unsigned ReorderedSplitsCnt =
7841 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7842 return TE->State == TreeEntry::SplitVectorize &&
7843 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7844 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7847 if (ReorderedSplitsCnt <= 1 &&
7849 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7850 return ((!TE->isGather() &&
7851 (TE->ReorderIndices.empty() ||
7852 (TE->UserTreeIndex.UserTE &&
7853 TE->UserTreeIndex.UserTE->State ==
7854 TreeEntry::Vectorize &&
7855 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7857 (TE->isGather() && TE->ReorderIndices.empty() &&
7858 (!TE->hasState() || TE->isAltShuffle() ||
7859 TE->getOpcode() == Instruction::Load ||
7860 TE->getOpcode() == Instruction::ZExt ||
7861 TE->getOpcode() == Instruction::SExt))) &&
7862 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7863 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
7864 return !isConstant(V) && isVectorized(V);
7866 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7869 bool HasPhis =
false;
7870 bool HasLoad =
true;
7871 unsigned GatherLoads = 0;
7872 for (
const std::unique_ptr<TreeEntry> &TE :
7873 ArrayRef(VectorizableTree).drop_front()) {
7874 if (TE->State == TreeEntry::SplitVectorize)
7876 if (!TE->hasState()) {
7880 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7885 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7886 if (!TE->isGather()) {
7893 if (GatherLoads >= GatherLoadsLimit)
7896 if (TE->getOpcode() == Instruction::GetElementPtr ||
7899 if (TE->getOpcode() != Instruction::PHI &&
7900 (!TE->hasCopyableElements() ||
7902 TE->Scalars.size() / 2))
7904 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7905 TE->getNumOperands() > PhiOpsLimit)
7914void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
7916 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
7919 std::iota(NewMask.
begin(), NewMask.
end(), 0);
7920 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7923 copy(MaskOrder, NewMaskOrder.begin());
7925 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
7926 unsigned Offset = CombinedEntriesWithIndices.
back().second;
7935 ReorderIndices.clear();
7954 ExternalUserReorderMap;
7958 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7959 const std::unique_ptr<TreeEntry> &TE) {
7962 findExternalStoreUsersReorderIndices(TE.get());
7963 if (!ExternalUserReorderIndices.
empty()) {
7964 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
7966 std::move(ExternalUserReorderIndices));
7972 if (TE->hasState() && TE->isAltShuffle() &&
7973 TE->State != TreeEntry::SplitVectorize) {
7974 Type *ScalarTy = TE->Scalars[0]->getType();
7976 unsigned Opcode0 = TE->getOpcode();
7977 unsigned Opcode1 = TE->getAltOpcode();
7981 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7982 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
7988 bool IgnoreReorder =
7989 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7990 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
7991 VectorizableTree.front()->getOpcode() == Instruction::Store);
7992 if (std::optional<OrdersType> CurrentOrder =
8002 const TreeEntry *UserTE = TE.get();
8004 if (!UserTE->UserTreeIndex)
8006 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8007 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8008 UserTE->UserTreeIndex.UserTE->Idx != 0)
8010 UserTE = UserTE->UserTreeIndex.UserTE;
8013 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8014 if (!(TE->State == TreeEntry::Vectorize ||
8015 TE->State == TreeEntry::StridedVectorize ||
8016 TE->State == TreeEntry::SplitVectorize ||
8017 TE->State == TreeEntry::CompressVectorize) ||
8018 !TE->ReuseShuffleIndices.empty())
8019 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8020 if (TE->State == TreeEntry::Vectorize &&
8021 TE->getOpcode() == Instruction::PHI)
8022 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8027 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8028 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8029 auto It = VFToOrderedEntries.
find(VF);
8030 if (It == VFToOrderedEntries.
end())
8044 for (
const TreeEntry *OpTE : OrderedEntries) {
8047 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8048 OpTE->State != TreeEntry::SplitVectorize)
8051 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8053 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8054 auto It = GathersToOrders.find(OpTE);
8055 if (It != GathersToOrders.end())
8058 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8059 auto It = AltShufflesToOrders.find(OpTE);
8060 if (It != AltShufflesToOrders.end())
8063 if (OpTE->State == TreeEntry::Vectorize &&
8064 OpTE->getOpcode() == Instruction::PHI) {
8065 auto It = PhisToOrders.
find(OpTE);
8066 if (It != PhisToOrders.
end())
8069 return OpTE->ReorderIndices;
8072 auto It = ExternalUserReorderMap.
find(OpTE);
8073 if (It != ExternalUserReorderMap.
end()) {
8074 const auto &ExternalUserReorderIndices = It->second;
8078 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8079 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8080 ExternalUserReorderIndices.size();
8082 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8083 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8090 if (OpTE->State == TreeEntry::Vectorize &&
8091 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8092 assert(!OpTE->isAltShuffle() &&
8093 "Alternate instructions are only supported by BinaryOperator "
8097 unsigned E = Order.
size();
8100 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8103 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8105 ++OrdersUses.try_emplace(Order, 0).first->second;
8108 if (OrdersUses.empty())
8111 unsigned IdentityCnt = 0;
8112 unsigned FilledIdentityCnt = 0;
8114 for (
auto &Pair : OrdersUses) {
8116 if (!Pair.first.empty())
8117 FilledIdentityCnt += Pair.second;
8118 IdentityCnt += Pair.second;
8123 unsigned Cnt = IdentityCnt;
8124 for (
auto &Pair : OrdersUses) {
8128 if (Cnt < Pair.second ||
8129 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8130 Cnt == Pair.second && !BestOrder.
empty() &&
8133 BestOrder = Pair.first;
8146 unsigned E = BestOrder.
size();
8148 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8151 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8153 if (TE->Scalars.size() != VF) {
8154 if (TE->ReuseShuffleIndices.size() == VF) {
8155 assert(TE->State != TreeEntry::SplitVectorize &&
8156 "Split vectorized not expected.");
8161 (!TE->UserTreeIndex ||
8162 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8163 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8164 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8165 "All users must be of VF size.");
8172 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8178 reorderNodeWithReuses(*TE, Mask);
8180 if (TE->UserTreeIndex &&
8181 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8182 TE->UserTreeIndex.UserTE->reorderSplitNode(
8183 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8187 if ((TE->State == TreeEntry::SplitVectorize &&
8188 TE->ReuseShuffleIndices.empty()) ||
8189 ((TE->State == TreeEntry::Vectorize ||
8190 TE->State == TreeEntry::StridedVectorize ||
8191 TE->State == TreeEntry::CompressVectorize) &&
8196 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8197 TE->ReuseShuffleIndices.empty())) &&
8198 "Alternate instructions are only supported by BinaryOperator "
8204 TE->reorderOperands(Mask);
8207 TE->reorderOperands(Mask);
8208 assert(TE->ReorderIndices.empty() &&
8209 "Expected empty reorder sequence.");
8212 if (!TE->ReuseShuffleIndices.empty()) {
8219 addMask(NewReuses, TE->ReuseShuffleIndices);
8220 TE->ReuseShuffleIndices.swap(NewReuses);
8221 }
else if (TE->UserTreeIndex &&
8222 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8224 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    const SmallPtrSetImpl<TreeEntry *> &ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
        }))
      continue;
    // Do not request operands, if they do not exist.
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
        continue;
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
        continue;
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
        continue;
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
        continue;
    }
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required.
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    if (ReorderableGathers.contains(TE))
      GatherOps.push_back(TE);
  }
}

void BoUpSLP::reorderBottomToUp(bool IgnoreReorder) {
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
    }
  };
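  // Entries are compared by the index of their user node, so that all
  // operands of the same user node are popped from the queue together and
  // can be reordered as a single group.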
  PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
  // Find all reorderable leaf nodes with the given VF.
  SmallPtrSet<TreeEntry *, 4> NonVectorized;
  SmallPtrSet<const TreeEntry *, 4> GathersToOrders;
  SmallPtrSet<const TreeEntry *, 4> Visited;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }
  while (!Queue.empty()) {
    // 1. Filter the ops: collect all entries with the same user node.
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    SmallVector<TreeEntry *> OrderedOps(1, TE);
    Queue.pop();
    while (!Queue.empty()) {
      TE = Queue.top();
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      Queue.pop();
      OrderedOps.push_back(TE);
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
        continue;
      // Build a map between user nodes and their operands order to speedup
      // search. The graph currently does not provide this dependency directly.
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    }
    if (!Users.first)
      continue;
    auto &Data = Users;
    if (Data.first->State == TreeEntry::SplitVectorize) {
      assert(
          Data.second.size() <= 2 &&
          "Expected not greater than 2 operands for split vectorize node.");
      if (any_of(Data.second,
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
        continue;
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        OrdersType Order = OpTE.ReorderIndices;
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
            continue;
          const auto BestOrder =
              getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
          if (!BestOrder || BestOrder->empty())
            continue;
          Order = *BestOrder;
        }
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        const unsigned E = Order.size();
        SmallVector<int> MaskOrder(E, PoisonMaskElem);
        transform(Order, MaskOrder.begin(), [E](unsigned I) {
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        });
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        // Clear ordering of the operand.
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          reorderReuses(OpTE.ReuseShuffleIndices, Mask);
        } else {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
          reorderScalars(OpTE.Scalars, Mask);
        }
      }
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
      continue;
    }
    // Check that operands are used only in the User node.
    SmallVector<TreeEntry *> GatherOps;
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
                             GatherOps);
    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    SmallPtrSet<const TreeEntry *, 4> VisitedOps, VisitedUsers, RevisitedOps;
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
        continue;
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
          return getReorderingData(*OpTE, /*TopToBottom=*/false, IgnoreReorder)
              .value_or(OrdersType(1));
        return OpTE->ReorderIndices;
      }();
      // The order is partially ordered, skip it in favor of fully ordered
      // orders.
      if (Order.size() == 1)
        continue;
      Value *Root = OpTE->hasState() ? OpTE->getMainOp() : OpTE->Scalars.front();
      auto GetSameNodesUsers = [&](Value *Root) {
        SmallSetVector<TreeEntry *, 4> Res;
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        return Res.takeVector();
      };
      auto GetNumOperands = [](const TreeEntry *TE) -> unsigned {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()))
          return CI->arg_size();
        return TE->getNumOperands();
      };
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
        return all_of(seq<unsigned>(GetNumOperands(TE)), [&](unsigned Idx) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            if (const TreeEntry *VecOp =
                    getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars))
              Op = VecOp;
          }
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
            return false;
          return true;
        });
      };
      // If other nodes with the same scalars have different orders, their
      // users decide whether this node should be reordered too.
      SmallVector<TreeEntry *> SameUsers = GetSameNodesUsers(Root);
      if (!SameUsers.empty() && all_of(SameUsers, [&](TreeEntry *UTE) {
            if (!RevisitedOps.insert(UTE).second)
              return true;
            return UTE == Data.first || !UTE->ReorderIndices.empty() ||
                   !UTE->ReuseShuffleIndices.empty() ||
                   (UTE->UserTreeIndex &&
                    UTE->UserTreeIndex.UserTE == Data.first) ||
                   (Data.first->UserTreeIndex &&
                    Data.first->UserTreeIndex.UserTE == UTE) ||
                   (IgnoreReorder && UTE->UserTreeIndex &&
                    UTE->UserTreeIndex.UserTE->Idx == 0) ||
                   NodeShouldBeReorderedWithOperands(UTE);
          })) {
        for (TreeEntry *UTE : SameUsers) {
          for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
            const TreeEntry *Op = getOperandEntry(UTE, Idx);
            Queue.push(const_cast<TreeEntry *>(Op));
          }
        }
      }
      unsigned NumOps = count_if(
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
          });
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
      } else {
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      }
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
          return true;
        if (TE->isGather())
          return !TE->hasState();
        return false;
      };
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
          continue;
        // May reorder user node if it requires reordering, has reused
        // scalars, is an alternate op vectorize node or its op nodes require
        // reordering.
        if (AllowsReordering(UserTE))
          continue;
        // Check if users allow reordering: only account the reordering if
        // most of the operand nodes do not require it.
        ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    // Erase operands from OrdersUses to avoid repeated reordering.
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    unsigned E = BestOrder.size();
    SmallVector<int> MaskOrder(E, PoisonMaskElem);
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    VisitedOps.clear();
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
        continue;
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
        continue;
      }
      // Gathers are processed separately.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
        continue;
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      reorderOrder(TE->ReorderIndices, Mask);
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    }
    // For gathers just need to reorder its scalars.
    for (TreeEntry *Gather : GatherOps) {
      assert(Gather->ReorderIndices.empty() &&
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
        // Just reorder reuses indices.
        reorderReuses(Gather->ReuseShuffleIndices, Mask);
        continue;
      }
      reorderScalars(Gather->Scalars, Mask);
      VisitedOps.insert(Gather);
    }
    // Reorder operands of the user node and set the ordering for the user
    // node itself.
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    };
    if (Data.first->State != TreeEntry::Vectorize ||
        !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
    if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      reorderScalars(Data.first->Scalars, Mask);
      reorderOrder(Data.first->ReorderIndices, MaskOrder,
                   /*BottomOrder=*/true);
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
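/// Returns the instruction anchoring \p Entry in the IR. For reversed strided
/// loads/stores the anchor is the scalar addressed by the first element of
/// the reorder mask rather than the first scalar in the bundle.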
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;
    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;
      // Check if the scalar has more uses than the in-tree ones; in this
      // case it is cheaper to extract it once for all external users.
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        continue;
      }
      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
      }
      for (User *U : Scalar->users()) {
        auto *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;
        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;
        // Skip in-tree scalars that become vectors.
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
            !UseEntries.empty()) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(UserInst)) ||
                isa<CallInst>(UserInst)) ||
              all_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) &&
                   "Scalar must not be used in gather nodes.");
            continue;
          }
        }
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
          break;
        }
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        if (!U)
          break;
      }
    }
  }
}
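/// Walks the users of every scalar in \p TE and collects the simple stores
/// they feed, grouped by parent block, stored type and underlying base
/// pointer, so that consecutive store chains can be detected afterwards.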
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // If the stored value is used by an instruction other than the store,
      // or the store is in a different function/block, skip it.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already visited or with a different base pointer.
      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now, just add the store with the matching lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int64_t> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}

bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec, llvm::less_first());
  unsigned Idx = 0;
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the
  // sorted StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToUp(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();
  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and
  // push it into `ExternalReorderIndices`.
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // We now know that the scalars in StoresVec can form a vector
    // instruction, so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}
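/// Clusters the loads in \p VL by parent block, type and constant pointer
/// distance. Each cluster is either merged into a matching group already in
/// \p GatheredLoads or appended as a new group, forming the input for
/// tryToVectorizeGatheredLoads().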
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
        &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
  SmallVector<SmallDenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int64_t> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int64_t &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int64_t> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int64_t, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int64_t> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                (has_single_bit(Data.size() + NumUniques) ||
                 bit_ceil(Data.size()) <
                     bit_ceil(Data.size() + NumUniques))))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int64_t Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert_range(LocalToAdd);
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        // If loads with the same parent and type were found - add to the
        // existing groups.
        It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
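/// Tries to vectorize the previously collected clusters of gathered loads,
/// attempting consecutive, compressed, strided, masked-gather and interleaved
/// load shapes, and finally re-running buildTreeRec() on the postponed load
/// entries.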
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
        SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
        &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    Align Alignment = computeCommonAlignment<LoadInst>(
        ArrayRef(reinterpret_cast<Value *const *>(Loads.begin()),
                 Loads.size()));
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSetImpl<LoadInst *> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        for (int NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), MaxVF);
             NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                              *TTI, Loads.front()->getType(), NumElts - 1))
          CandidateVFs.push_back(NumElts);
        if (Final && CandidateVFs.empty())
          return Results;
        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, E - Cnt));
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                areKnownNonVectorizableLoads(Slice))
              continue;
            // Check if it is profitable to try vectorizing gathered loads.
            bool AllowToVectorize = false;
            // Check if it is profitable to vectorize 2-element loads.
            if (NumElts == 2) {
              bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
                  Slice.front()->getType(), ElementCount::getFixed(NumElts));
              auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
                for (LoadInst *LI : Slice) {
                  // If single use/user - allow to vectorize.
                  if (LI->hasOneUse())
                    continue;
                  // 1. Check if the number of uses equals the number of
                  // users.
                  // 2. The load broadcasts are not allowed or the load is not
                  // broadcasted.
                  if (static_cast<unsigned int>(std::distance(
                          LI->user_begin(), LI->user_end())) !=
                      LI->getNumUses())
                    return false;
                  if (!IsLegalBroadcastLoad)
                    continue;
                  for (User *U : LI->users()) {
                    for (const TreeEntry *UTE : getTreeEntries(U)) {
                      for (int I : seq<int>(UTE->getNumOperands())) {
                        if (all_of(UTE->getOperand(I), [LI](Value *V) {
                              return V == LI || isa<PoisonValue>(V);
                            }))
                          // Found legal broadcast - do not vectorize.
                          return false;
                      }
                    }
                  }
                }
                return true;
              };
              AllowToVectorize = CheckIfAllowed(Slice);
            } else {
              AllowToVectorize =
                  (NumElts >= 3 ||
                   any_of(ValueToGatherNodes.at(Slice.front()),
                          [=](const TreeEntry *TE) {
                            return TE->Scalars.size() == 2 &&
                                   ((TE->Scalars.front() == Slice.front() &&
                                     TE->Scalars.back() == Slice.back()) ||
                                    (TE->Scalars.front() == Slice.back() &&
                                     TE->Scalars.back() == Slice.front()));
                          })) &&
                  hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                           Slice.size());
            }
            if (AllowToVectorize) {
              SmallVector<Value *> PointerOps;
              OrdersType CurrentOrder;
              // Try to build vector load.
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              StridedPtrInfo SPtrInfo;
              LoadsState LS =
                  canVectorizeLoads(Values, Slice.front(), CurrentOrder,
                                    PointerOps, SPtrInfo, &BestVF);
              if (LS != LoadsState::Gather ||
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                if (LS == LoadsState::ScatterVectorize) {
                  if (MaskedGatherVectorized.empty() ||
                      Cnt >= MaskedGatherVectorized.back() + NumElts)
                    MaskedGatherVectorized.push_back(Cnt);
                  continue;
                }
                if (LS != LoadsState::Gather) {
                  Results.emplace_back(Values, LS);
                  VectorizedLoads.insert_range(Slice);
                  // If we vectorized the initial block, no need to try to
                  // vectorize it again.
                  if (Cnt == StartIdx)
                    StartIdx += NumElts;
                }
                // Check if the whole array was vectorized already - exit.
                if (StartIdx >= Loads.size())
                  break;
                // Erase last masked gather candidate, if another candidate
                // within the range is found to be better.
                if (!MaskedGatherVectorized.empty() &&
                    Cnt < MaskedGatherVectorized.back() + NumElts)
                  MaskedGatherVectorized.pop_back();
                Cnt += NumElts - 1;
                continue;
              }
            }
            if (!AllowToVectorize || BestVF == 0)
              registerNonVectorizableLoads(Slice);
          }
          // Mark masked gathers candidates, if any, as vectorized.
          for (unsigned Cnt : MaskedGatherVectorized) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            ArrayRef<Value *> Values(
                reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
            Results.emplace_back(Values, LoadsState::ScatterVectorize);
            VectorizedLoads.insert_range(Slice);
            // If we vectorized the initial block, no need to try to vectorize
            // it again.
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
        return Results;
      };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
             GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
              LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int64_t> &L) {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            if (isVectorized(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              all_of(Results,
                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
                       return P.second == LoadsState::ScatterVectorize;
                     })) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
              for (Value *L : Slice)
                if (!isVectorized(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }
            // Select maximum VF as a maximum of user gathered nodes and
            // distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is the part of the interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (isVectorized(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents interleaved load
              // operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                StridedPtrInfo SPtrInfo;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                                      SPtrInfo) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes
              // - just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           const TreeEntry &TE =
                               *VectorizableTree[std::get<0>(P)];
                           ArrayRef<Value *> VL = TE.Scalars;
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           (void)canVectorizeLoads(VL, VL.front(), Order,
                                                   PointerOps, SPtrInfo);
                           ConsecutiveNodesSize += VL.size();
                           size_t Start = std::distance(Slice.begin(), It);
                           size_t Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(Start, VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps,
                                      SPtrInfo) == LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (isVectorized(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry, which is
                // not equal to the entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
          FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads to other list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, consider it as no gathered loads entries
  // must be handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  if (auto *I = dyn_cast<Instruction>(V)) {
    // Hash alternatable instructions by their opcode class, if allowed.
    if (AllowAlternate && isValidForAlternation(I->getOpcode())) {
      // ...
    }
    if (auto *CI = dyn_cast<CmpInst>(I)) {
      std::pair<size_t, size_t> OpVals =
          generateKeySubkey(CI->getOperand(0), TLI, LoadsSubkeyGenerator,
                            /*AllowAlternate=*/false);
      if (CI->isCommutative())
        SubKey = hash_combine(SubKey, OpVals.first);
      // ...
    }
    if (auto *Gep = dyn_cast<GetElementPtrInst>(I))
      SubKey = hash_value(Gep->getPointerOperand());
  }
  return std::make_pair(Key, SubKey);
}

/// Checks if the specified instruction \p I is an alternate operation for
/// the given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);
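/// Estimates whether an alternate-opcode node for \p VL is worth building:
/// either the target supports the mixed pattern directly, or the operands can
/// be vectorized or reused cheaply enough to beat the scalar code.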
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
                           Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best candidates for the swapped operands.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate number of instructions, required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles, required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain same values and create either perfect
  // diamond match or shuffled match.
  if (Operands.size() == 2) {
    // Do not count same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize node, if:
  // 1. at least single operand is constant or splat.
  // 2. Operands have many loop invariants (the instructions are not loop
  // invariants).
  // 3. At least single unique operands is supposed to vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) ||
                       (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
                        getSameOpcode(Op, *TLI)))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         isVectorized(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found first duplicate - need to add shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return isVectorized(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}

/// Builds the arguments types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Arg] : enumerate(CI->args())) {
    if (ID != Intrinsic::not_intrinsic) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
        ArgTys.push_back(Arg->getType());
        continue;
      }
      if (MinBW > 0) {
        ArgTys.push_back(
            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
        continue;
      }
    }
    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
  }
  return ArgTys;
}

/// Calculates the costs of the vectorized intrinsic and the vectorized
/// library call (if available) for the given call.
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
  InstructionCost IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
  // The cost of scalarizing the call acts as an upper limit for the library
  // call cost.
  InstructionCost ScalarLimit = InstructionCost::getInvalid();
  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  InstructionCost LibCost = InstructionCost::getInvalid();
  if (!CI->isNoBuiltin() && VecFunc) {
    // Calculate the cost of the vector library call.
    LibCost = TTI->getCallInstrCost(nullptr, VecTy, ArgTys,
                                    TTI::TCK_RecipThroughput);
  }
  return std::make_pair(IntrinsicCost,
                        LibCost.isValid() ? LibCost : ScalarLimit);
}
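/// Classifies bundle \p VL and decides which TreeEntry::EntryState fits it
/// best (vectorize, strided/compressed/scatter load forms, or gather),
/// filling the discovered pointer order and pointer operands on the way.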
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");
  Instruction *VL0 = S.getMainOp();
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: vectorizing is not supported yet for non-power-of-2 ops.
    if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }
    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    if (any_of(VL, [&SourceVectors](Value *V) {
          // The last insertelement can have multiple uses.
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return any_of(VL, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return false;
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpPredicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpPredicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }
    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }
    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and these need to be the same,
      // in order to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J]) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J
                       << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    // Check the cost of the vector call.
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(), 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non-alternate shuffles.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcodes like add-sub, then do
      // not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    break;
  }
  LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
  return TreeEntry::NeedToGather;
}
namespace {
/// Builds the operand lists for a bundle of PHI nodes, handling PHIs with
/// many incoming blocks efficiently.
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        if (!DT.isReachableFromEntry(InBB)) {
          Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      if (!DT.isReachableFromEntry(InBB)) {
        Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
        continue;
      }
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      auto *P = dyn_cast<PHINode>(V);
      if (!P) {
        assert(isa<PoisonValue>(V) &&
               "Expected isa instruction or poison value.");
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.front();
      for (unsigned I : IncomingValues.drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() ==
                                   Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace

/// Returns main/alternate instructions for the given \p VL. Unlike
/// getSameOpcode this supports otherwise non-compatible instructions for
/// better split-node support.
static std::pair<Instruction *, Instruction *>
getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!MainOp) {
      MainOp = I;
      continue;
    }
    if (MainOp->getOpcode() == I->getOpcode()) {
      if (I->getParent() != MainOp->getParent())
        return {};
      continue;
    }
    if (!AltOp) {
      AltOp = I;
      continue;
    }
    if (AltOp->getOpcode() == I->getOpcode()) {
      if (I->getParent() != AltOp->getParent())
        return {};
      continue;
    }
    return {};
  }
  if (!AltOp)
    return {};
  assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
}
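/// Deduplicates the bundle: repeated scalars are folded into
/// \p ReuseShuffleIndices and, if requested and profitable, the unique
/// scalars are padded with poison up to a full register width.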
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo *TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool TryPad = false) {
  // Check that every instruction appears once in this bundle.
  SmallVector<Value *> UniqueValues;
  SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
  for (Value *V : VL) {
    if (isConstant(V)) {
      ReuseShuffleIndices.emplace_back(
          isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
      UniqueValues.emplace_back(V);
      continue;
    }
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(V);
  }
  size_t NumUniqueScalarValues = UniqueValues.size();
  bool IsFullVectors = hasFullVectorsOrPowerOf2(
      TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
  if (NumUniqueScalarValues == VL.size() &&
      (VectorizeNonPowerOf2 || IsFullVectors)) {
    ReuseShuffleIndices.clear();
  } else {
    // FIXME: Reshuffling scalars is not supported yet for "copyable"
    // elements.
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
        !hasFullVectorsOrPowerOf2(TTI, VL.front()->getType(), VL.size())) {
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
        (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
           return isa<UndefValue>(V) || !isConstant(V);
         }))) {
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
           all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
        // Find the number of elements, which forms full vectors.
        unsigned PWSz = getFullVectorNumberOfElements(
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          // We ended up with the same size after the dedup, keep as is.
          ReuseShuffleIndices.clear();
        } else {
          // Pad unique values with poison to the full vector size.
          SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
                                                  UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
              PoisonValue::get(UniqueValues.front()->getType()));
          // Check that extended with poisons operations are still valid for
          // vectorization (div/rem are not allowed).
          if (!S.areInstructionsWithCopyableElements() &&
              !getSameOpcode(PaddedUniqueValues, *TLI).valid()) {
            LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
            ReuseShuffleIndices.clear();
            return false;
          }
          VL = std::move(PaddedUniqueValues);
        }
        return true;
      }
      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    VL = std::move(UniqueValues);
  }
  return true;
}
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;

  // Check if this is a duplicate of another split entry.
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    << ".\n");
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                        << *LocalState.getMainOp() << ".\n");
      return false;
    }
  }

  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(VL)) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         I->getOpcode() == LocalState.getOpcode()) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, LocalState.getMainOp(),
                                 LocalState.getAltOp(), *TLI))) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(V);
  }
  Type *ScalarTy = getValueType(VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Enable split node, only if all nodes do not form legal alternate
  // instruction (like X86 addsub).
  SmallPtrSet<Value *, 8> UOp1(Op1.begin(), Op1.end());
  SmallPtrSet<Value *, 8> UOp2(Op2.begin(), Op2.end());
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
    return false;
  // Build the reorder mask: first all main-opcode elements, then the rest.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
  VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
  // Check non-profitable case, when the mask is skewed and the split does
  // not reduce the number of vector registers.
  if (NumParts >= VL.size())
    return false;
  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = ::getShuffleCost(
      *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
  InstructionCost NewShuffleCost =
      ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (isa<PoisonValue>(VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                              VecTy, OriginalMask, Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
                 VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
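/// Analysis that models "copyable" elements: a mixed bundle is treated as a
/// single-opcode bundle by substituting idempotent forms (x + 0, x >> 0) for
/// the non-matching elements, and checks whether doing so is profitable.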
class InstructionsCompatibilityAnalysis {
  DominatorTree &DT;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;
  Instruction *MainOp = nullptr;

  /// Checks if the opcode is supported as the main opcode for copyable
  /// elements.
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
  }

  /// Identifies the best candidate value, which represents the main opcode
  /// operation.
  void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
    BasicBlock *Parent = nullptr;
    auto IsSupportedInstruction = [&](Instruction *I) {
      return I && isSupportedOpcode(I->getOpcode()) && !R.isVectorized(I);
    };
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.clear();
        Operands.insert(I->op_begin(), I->op_end());
      }
    }
    unsigned BestOpcodeNum = 0;
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
        continue;
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I) && !Operands.contains(I)) {
          MainOp = I;
          MainOpcode = I->getOpcode();
          BestOpcodeNum = P.second.size();
          break;
        }
      }
    }
    if (MainOp && any_of(VL, [&](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->getParent() == MainOp->getParent() &&
                 I->getOpcode() != MainOpcode && Operands.contains(I);
        })) {
      MainOp = nullptr;
      MainOpcode = 0;
    }
  }

  /// Returns the idempotent value for the main opcode (e.g. x + 0, x >> 0).
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return ConstantInt::getNullValue(MainOp->getOperand(1)->getType());
  }

  /// Returns the value and operands for the copyable element \p V.
  SmallVector<Value *> getOperands(const InstructionsState &S,
                                   Value *V) const {
    if (!S.isCopyableElement(V))
      return convertTo(cast<Instruction>(V), S).second;
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
  }

  /// Builds operands for the original (non-copyable) instructions.
  void buildOriginalOperands(
      const InstructionsState &S, ArrayRef<Value *> VL,
      SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    Instruction *VL0 = S.getMainOp();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      // All extracts share the single source vector operand.
      Operands.assign(1, {VL.size(), VL0->getOperand(0)});
      return;
    case Instruction::InsertElement:
      Operands.assign(2, {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(VL)) {
        auto *IE = cast<InsertElementInst>(V);
        Operands[0][Idx] = IE->getOperand(0);
        Operands[1][Idx] = IE->getOperand(1);
      }
      return;
    case Instruction::Load:
      Operands.assign(
          1, {VL.size(),
              PoisonValue::get(
                  cast<LoadInst>(VL0)->getPointerOperand()->getType())});
      for (auto [V, Op] : zip(VL, Operands.back())) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        Op = LI->getPointerOperand();
      }
      return;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
      Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I) {
          for (auto [OpIdx, Ops] : enumerate(Operands))
            Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
          continue;
        }
        auto [Op, ConvertedOps] = convertTo(I, S);
        for (auto [OpIdx, Ops] : enumerate(Operands))
          Ops[Idx] = ConvertedOps[OpIdx];
      }
      return;
    case Instruction::GetElementPtr: {
      Operands.assign(2, {VL.size(), nullptr});
      // Need to cast all indices to the same type before vectorization to
      // avoid crash. Required to be able to find correct matches between
      // different gather nodes and reuse the vectorized values rather than
      // trying to gather them again.
      const unsigned IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty = all_of(VL,
                        [VL0Ty](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          return !GEP || VL0Ty ==
                                             GEP->getOperand(IndexIdx)
                                                 ->getType();
                        })
                     ? VL0Ty
                     : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                           ->getPointerOperandType()
                                           ->getScalarType());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
          continue;
        }
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        Operands[1][Idx] =
            CI ? ConstantFoldIntegerCast(CI, Ty,
                                         CI->getValue().isSignBitSet(), DL)
               : Op;
      }
      return;
    }
    case Instruction::Call: {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
      for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
          continue;
        auto &Ops = Operands.emplace_back();
        for (Value *V : VL) {
          auto *I = dyn_cast<Instruction>(V);
          Ops.push_back(I ? I->getOperand(Idx)
                          : PoisonValue::get(
                                VL0->getOperand(Idx)->getType()));
        }
      }
      return;
    }
    default:
      break;
    }
    llvm_unreachable("Unexpected vectorization of the instructions.");
  }

public:
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}

  InstructionsState
  buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
                         bool TryCopyableElementsVectorization,
                         bool WithProfitabilityCheck = false,
                         bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              : getSameOpcode(VL, TLI);
    if (S)
      return S;
    if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
      return S;
    findAndSetMainInstruction(VL, R);
    if (!MainOp)
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
    if (!WithProfitabilityCheck)
      return S;
    // Check if it is profitable to vectorize the instruction.
    SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
           Value *V1, Value *V2) {
          if (V1 != V2 && isa<PHINode>(V1))
            return;
          auto *I1 = dyn_cast<Instruction>(V1);
          auto *I2 = dyn_cast<Instruction>(V2);
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
            return;
          Candidates.emplace_back(V1, V2);
        };
    if (VL.size() == 2) {
      // Check if the operands allow better vectorization.
      SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      if (!Res && isCommutative(MainOp)) {
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      }
      if (!Res)
        return InstructionsState::invalid();
    }
    // Compare the vector cost of the copyable node with the scalar cost.
    FixedVectorType *VecTy = getWidenedType(MainOp->getType(), VL.size());
    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
    InstructionCost ScalarCost;
    InstructionCost VectorCost;
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::LShr:
      ScalarCost = TTI.getArithmeticInstrCost(MainOpcode, MainOp->getType(),
                                              Kind) *
                   count_if(VL, IsaPred<Instruction>);
      VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
      break;
    default:
      llvm_unreachable("Unexpected opcode.");
    }
    if (VectorCost > ScalarCost)
      return InstructionsState::invalid();
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // Too many copyable elements - exit.
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
         CopyableNum >= MaxPHINumOperands))
      return InstructionsState::invalid();
    // Check profitability if number of copyables > VL.size() / 2.
    // 1. Reorder operands for better matching.
    if (isCommutative(MainOp)) {
      for (auto &Ops : Operands) {
        // Make instructions the first operands.
        if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
          std::swap(Ops.front(), Ops.back());
          continue;
        }
      }
    }
    // 2. Check, if operands are also vectorizable.
    auto CheckOperand = [&](ArrayRef<Value *> Ops) {
      // Check that the operand is not all-constant/splat, but still
      // vectorizable with copyable elements.
      constexpr unsigned Limit = 4;
      if (Ops.size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
        for (Value *V : Ops)
          ++Counters.try_emplace(V, 0).first->second;
        if (all_of(Counters, [](const auto &C) { return C.second == 1; }))
          return true;
      }
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
          Ops, R, /*TryCopyableElementsVectorization=*/true);
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
        return false;
      unsigned CopyableNum =
          count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    return S;
  }

  SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
                                                ArrayRef<Value *> VL) {
    assert(S && "Invalid state!");
    SmallVector<BoUpSLP::ValueList> Operands;
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      Operands.assign(MainOp->getNumOperands(),
                      BoUpSLP::ValueList(VL.size(), nullptr));
      for (auto [Idx, V] : enumerate(VL)) {
        SmallVector<Value *> OperandsForValue = getOperands(S, V);
        for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
          Operands[OperandIdx][Idx] = Operand;
      }
    } else {
      buildOriginalOperands(S, VL, Operands);
    }
    return Operands;
  }
};
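/// Legality portion of buildTreeRec(): builds the InstructionsState for the
/// bundle, rejects duplicate entries, over-deep recursion, scalable vector
/// types, ephemeral/ignored scalars and unprofitable alternate nodes, and
/// reports whether duplicate-folding or split vectorization should be
/// retried.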
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
    bool TryCopyableElementsVectorization) const {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                      << ".\n");
    for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
      if (E->isSame(VL)) {
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext
  // of a load), in which case peek through to include it in the tree, without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }

  // If all of the operands are identical or constant we have a simple
  // solution. If we deal with insert/extract instructions, they all must
  // have constant indices, otherwise we should gather them, not try to
  // vectorize. If alternate op node with 2 elements with gathered operands -
  // do not vectorize.
  auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build a
    // regular vectorize node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand,
                                        LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    if (!S) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n";);
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                          /*TryToFindDuplicates=*/true,
                                          /*TrySplitVectorize=*/true);
    }
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";);
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                            /*TryToFindDuplicates=*/false);
      }
    }
  }

  // Check if the alternate node is profitable or the scalars are cheaper to
  // extract/rebuild.
  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
      APInt Extracted = APInt::getZero(VL.size());
      APInt Vectorized = APInt::getAllOnes(VL.size());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || doesNotNeedToBeScheduled(I) ||
            all_of(I->operands(), [&](const Use &U) {
              return isa<ExtractElementInst>(U.get());
            }))
          continue;
        if (isVectorized(I))
          Vectorized.clearBit(Idx);
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
          Extracted.setBit(Idx);
      }
      return std::make_pair(Vectorized, Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // Rough cost estimation: check if the vector code (+ potential
      // extracts) is more profitable than the scalar + buildvector.
      Type *ScalarTy = VL.front()->getType();
      auto *VecTy = getWidenedType(ScalarTy, VL.size());
      InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Extracted,
          /*Insert=*/false, /*Extract=*/true, Kind);
      InstructionCost VectorizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Vectorized,
          /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  BasicBlock *EntryBB = S.getMainOp()->getParent();
  if (!EntryBB || (isa_and_nonnull<PHINode>(S.getMainOp()) &&
                   !DT->isReachableFromEntry(EntryBB))) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }
  return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
}
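/// Recursively builds the vectorizable tree for \p VL: legality checks,
/// duplicate folding, state classification, bundle scheduling and creation of
/// the per-opcode TreeEntry with recursion into its operands.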
11321 unsigned InterleaveFactor) {
11324 SmallVector<int> ReuseShuffleIndices;
11328 auto TrySplitNode = [&](
const InstructionsState &LocalState) {
11331 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11334 auto Invalid = ScheduleBundle::invalid();
11335 auto *
TE = newTreeEntry(VL, TreeEntry::SplitVectorize,
Invalid, LocalState,
11336 UserTreeIdx, {}, ReorderIndices);
11341 getSameValuesTreeEntry(S.getMainOp(),
Op,
true))) {
11343 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11344 Idx == 0 ? 0 : Op1.
size());
11345 (void)newTreeEntry(
Op, TreeEntry::NeedToGather,
Invalid, S, {
TE, Idx});
11347 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11348 Idx == 0 ? 0 : Op1.
size());
11358 bool AreConsts =
false;
11359 for (
Value *V : VL) {
11371 if (AreOnlyConstsWithPHIs(VL)) {
11372 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to all constants and PHIs.\n");
11373 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11377 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11378 VL,
Depth, UserTreeIdx,
false);
11379 InstructionsState S = Legality.getInstructionsState();
11380 if (!Legality.isLegal()) {
11381 if (Legality.trySplitVectorize()) {
11384 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11388 Legality = getScalarsVectorizationLegality(
11389 VL,
Depth, UserTreeIdx,
true);
11390 if (!Legality.isLegal()) {
11391 if (Legality.tryToFindDuplicates())
11395 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11398 S = Legality.getInstructionsState();
11402 if (S.isAltShuffle() && TrySplitNode(S))
11408 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11413 bool IsScatterVectorizeUserTE =
11414 UserTreeIdx.UserTE &&
11415 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11418 StridedPtrInfo SPtrInfo;
11419 TreeEntry::EntryState State = getScalarsVectorizationState(
11420 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11421 if (State == TreeEntry::NeedToGather) {
11422 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11428 auto &BSRef = BlocksSchedules[BB];
11430 BSRef = std::make_unique<BlockScheduling>(BB);
11432 BlockScheduling &BS = *BSRef;
11435 std::optional<ScheduleBundle *> BundlePtr =
11436 BS.tryScheduleBundle(UniqueValues.getArrayRef(),
this, S, UserTreeIdx);
11437#ifdef EXPENSIVE_CHECKS
11441 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11442 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
11444 if (S.isAltShuffle() && ReuseShuffleIndices.
empty() && TrySplitNode(S))
11446 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11447 NonScheduledFirst.insert(VL.front());
11448 if (S.getOpcode() == Instruction::Load &&
11449 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11453 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
11455 ScheduleBundle
Empty;
11456 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() :
Empty;
11457 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
11459 unsigned ShuffleOrOp =
11460 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
11461 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &
Operands) {
11463 SmallVector<unsigned> PHIOps;
11469 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11474 for (
unsigned I : PHIOps)
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // ...
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());
    // ...
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    // ...
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      // Vectorizing non-consecutive loads with (masked)load + compress.
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                           "(masked LoadInst + compress).\n";
                 TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                           "(non-consecutive LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      // ...
    }
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
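  // Illustrative example of a "jumbled" load bundle handled above: the
  // scalars VL = { a[1], a[0], a[3], a[2] } still cover the consecutive
  // range a[0..3], so they are vectorized as one wide load whose result is
  // permuted by CurrentOrder = {1, 0, 3, 2} instead of being gathered
  // element by element.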
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    //   assert(P0 == CmpInst::getSwappedPredicate(P0) &&
    //          "Commutative Predicate mismatch");
    // ...
    if (Cmp->getPredicate() != P0)
      std::swap(Left[Idx], Right[Idx]);
    // ...
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
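  // For example, an i32 operand known to have at least 17 sign bits
  // effectively carries no more than 16 significant bits, so
  // NumSignBits * 2 >= 32 holds and the operand node is recorded in
  // ExtraBitWidthNodes as a candidate for narrowing (e.g. to i16) during
  // the minimal-bitwidth analysis.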
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(SelectInst/UnaryOperator/BinaryOperator/"
                         "FreezeInst).\n";
               TE->dump());
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    // ...
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 TE->dump());
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }
    // ...
    //   assert(MainP != AltP &&
    //          "Expected different main/alternate predicates.");
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  // ...
}
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  // ...
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  // Check if all of the extracts come from the same vector and from the
  // same position.
  Value *Vec = E0->getOperand(0);
  CurrentOrder.clear();
  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // ...
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp,
    SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
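// For example, for Scalars = {fadd, fsub, fadd, fsub} (Sz == 4, no
// reordering or reuses) and IsAltOp matching fsub, the resulting selection
// mask is {0, 5, 2, 7}: even lanes select from the main-opcode vector and
// odd lanes select from the alternate vector appended behind it (offset Sz).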
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
}

// ...
  assert(MainP != AltP && "Expected different main/alternate predicates.");
  // ...
  assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
         "CmpInst expected to match either main or alternate predicate or "
         "their swap.");
  // Alternate state - where the instruction matches the alternate predicate
  // or its swap.
  return MainP != P && MainP != SwappedP;
// ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         AltOp;
}

// ...
  const auto *Op0 = Ops.front();
  // ...
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    auto *CI = dyn_cast<ConstantInt>(V);
    return CI && CI->getValue().isPowerOf2();
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    auto *CI = dyn_cast<ConstantInt>(V);
    return CI && CI->getValue().isNegatedPowerOf2();
  });
  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. Returns its "virtual" vector
  /// factor: the number of ScalarTy-sized elements it covers.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          // ...
        }))
      return true;
    // ...
  }

  /// Tries to combine 2 different masks into a single one: shuffling by Mask
  /// followed by shuffling by ExtMask equals shuffling by the combined mask.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
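  // For example, combining Mask = {3, 2, 1, 0} (VF == 4) with
  // ExtMask = {1, 3, PoisonMaskElem, 0} and LocalVF == 4 yields
  // {2, 0, PoisonMaskElem, 3}: applying Mask to a vector and then ExtMask to
  // the result is equivalent to the single combined shuffle.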
  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // ...
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             /* ... */ IdentityMask.size())) {
          IdentityOp = SV;
          IdentityMask.assign(Mask);
        }
      }
      // ...
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) /* ... */) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle &&
                 Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 all_of(enumerate(Mask), [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      // ...
    }
    V = Op;
    // ...
  }
  /// Smart shuffle instruction emission, walks through the shuffle tree and
  /// tries to find the best matching vectors for the actual shuffle.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      // ...
    }
    if (V2) {
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
        VF = FTy->getNumElements();
      // Peek through shuffles, trying to simplify the final shuffle mask.
      Value *Op1 = V1;
      Value *Op2 = V2;
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // ...
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) {
          auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2);
          if (!SV2)
            break;
          SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
          for (auto [Idx, I] : enumerate(CombinedMask1)) {
            if (I == PoisonMaskElem)
              continue;
            ExtMask1[Idx] = SV1->getMaskValue(I);
          }
          SmallBitVector UseMask1 = buildUseMask(
              cast<FixedVectorType>(SV1->getOperand(1)->getType())
                  ->getNumElements(),
              ExtMask1, UseMask::SecondArg);
          SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
          for (auto [Idx, I] : enumerate(CombinedMask2)) {
            if (I == PoisonMaskElem)
              continue;
            ExtMask2[Idx] = SV2->getMaskValue(I);
          }
          SmallBitVector UseMask2 = buildUseMask(
              cast<FixedVectorType>(SV2->getOperand(1)->getType())
                  ->getNumElements(),
              ExtMask2, UseMask::SecondArg);
          if (SV1->getOperand(0)->getType() ==
                  SV2->getOperand(0)->getType() &&
              SV1->getOperand(0)->getType() != SV1->getType() &&
              isUndefVector(SV1->getOperand(1), UseMask1).all() &&
              isUndefVector(SV2->getOperand(1), UseMask2).all()) {
            Op1 = SV1->getOperand(0);
            Op2 = SV2->getOperand(0);
            SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
            int LocalVF = ShuffleMask1.size();
            if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
            CombinedMask1.swap(ShuffleMask1);
            SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
            LocalVF = ShuffleMask2.size();
            if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
            CombinedMask2.swap(ShuffleMask2);
          }
        }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      // ...
      if (Op1 == Op2 /* ... */)
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<FixedVectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
  // ...
};
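// Example of the peek-through combining performed by
// BaseShuffleAnalysis::createShuffle above: permuting
//   %s = shufflevector %x, poison, <3, 2, 1, 0>
// with the mask <1, 0, 3, 2> folds into a single shufflevector of %x with
// mask <2, 3, 0, 1>, so only one shuffle instruction (or one shuffle cost
// item, in the cost-estimation specialization) remains.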
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plain wide loads/stores.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in vectorized code if it is not a
      // GEP instruction; its cost is considered free anyway.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads being vectorized as a masked
    // gather.
    auto PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  MapVector<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if the node is small (just 2 elements), all-constant
  // or all instructions have the same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(hash_value(LI->getParent()), Key);
    Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2)
          return hash_value(LIt->second.back()->getPointerOperand());
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra
  // vectorized nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    // ...
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider them as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                             CostKind, Idx, getWidenedType(ScalarTy, Sz));
  }
  // ...
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V))
      ReorderMask[I] = PoisonMaskElem;
    else if (isConstant(V) || DemandedElts[I])
      ReorderMask[I] = I + TE.ReorderIndices.size();
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  ReorderMask.assign(Sz, PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
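// A sketch of the effect (illustrative values): a gather node with scalars
// {a[0], x, a[1], y} may be reordered to {a[0], a[1], x, y} so the two
// related loads form a contiguous subvector; the reordering is kept only if
// the insert-subvector plus permute cost does not exceed the plain
// build-vector cost computed above.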
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       DominatorTree &DT, const DataLayout &DL,
                                       TargetTransformInfo &TTI,
                                       const TargetLibraryInfo &TLI) {
  assert(all_of(VL,
                [](Value *V) {
                  return V->getType()->getScalarType()->isFloatingPointTy();
                }) &&
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (S.isCopyableElement(I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // The fmul operands also must be contractable.
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the costs.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost FMulPlusFAddCost = 0;
  InstructionCost FMACost = 0;
  FastMathFlags FMF;
  FMF.set();
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!S.isCopyableElement(I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  for (Value *V : Operands.front()) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(V);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      if (auto *OpI = dyn_cast<Instruction>(V))
        FMACost += TTI.getInstructionCost(OpI, CostKind);
      continue;
    }
    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  // ...
}
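// For example (fast-math flags permitting contraction):
//   %m = fmul fast float %a, %b
//   %r = fadd fast float %m, %c
// contracts to
//   %r = call fast float @llvm.fmuladd.f32(float %a, float %b, float %c)
// which is the transformation whose profitability the FMACost vs.
// FMulPlusFAddCost comparison above models.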
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant
  // or
  // 2. Splats
  // or
  // 3. Result in a good vectorization opportunity.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) /* ... */;
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // ...
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) == 2;

  // Checks if the scalars are used in another node.
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              function_ref<bool(Value *)> CheckContainer) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             // ...
             return CheckContainer(V);
           });
  };
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
      // ...
      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
    } else {
      // Check if the gather node is a full copy of a split node.
      auto *It = find_if(E.Scalars, IsaPred<Instruction>);
      if (It != E.Scalars.end()) {
        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() &&
                       any_of(VTEs, [&](const TreeEntry *TE) {
                         return is_contained(TEs, TE);
                       });
              });
            }))
          return true;
      }
    }
    return false;
  };

  // The tree may grow here, so iterate over nodes built before.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of other vector nodes.
      if (CheckForSameVectorNodes(E))
        continue;
      // Try to find vectorizable sequences and transform them into a series
      // of insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again;
          // reuse the existing node if it fully matches the slice.
          if (isVectorized(Slice.front()) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
            continue;
          // ...
          // Do not try to vectorize small splats (less than a vector
          // register and only with the single non-undef element).
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              /* ... */ false) {
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load /* ... */) ||
                (S.getOpcode() != Instruction::Load /* ... */))
              continue;
            if (VF == 2) {
              // For a list of 2 values, it is only profitable to vectorize
              // if the scalar cost exceeds the extractelement cost.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  TTI->getInstructionCost(S.getMainOp(), CostKind) <
                      TTI::TCC_Expensive /* ... */)
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                StridedPtrInfo SPtrInfo;
                LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
                                                   PointerOps, SPtrInfo);
                // ...
                if (UserIgnoreList && E.Idx == 0)
                  analyzedReductionVals(Slice);
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(/* ... */))) {
                // Do not vectorize extractelements (handled effectively
                // already) or non-profitable instructions (low cost and
                // non-vectorizable operands).
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end())
            SameTE = getSameValuesTreeEntry(*It, Slice);
          // ...
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // Restore ordering, if no extra vectorization happened.
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
    }
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather vectorizations here.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as a
      // strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost) {
          // Strided load is more profitable than consecutive load + reverse:
          // transform the node to a strided load.
          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
                                                ->getPointerOperand()
                                                ->getType());
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
        }
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as a
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store: transform the node to a strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1))
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
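    // For example, the reorder mask <0, 4, 1, 5, 2, 6, 3, 7> over eight
    // stored elements is an interleave mask with Factor == 2, so the
    // sequence can be emitted as a single interleaved store on targets where
    // isLegalInterleavedAccessType() holds (e.g. via st2-style instructions).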
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ...
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // Check if possible to convert (a * b) + c to fma.
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      // This node is a fmuladd node.
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The FMul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }

  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with a small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        // ...
        count_if(/* ... */,
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load /* ... */;
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        // ...
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(LI->getParent(),
                                          getUnderlyingObject(
                                              LI->getPointerOperand()),
                                          LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
/// Estimates the cost of merging shuffle masks over up to two input vectors
/// (plain values or tree entries), delaying the actual shuffle cost emission
/// until it is really required.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, the common mask is not yet flushed into the cost.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    const auto *It = find_if_not(VL, IsaPred<UndefValue>);
    assert(It != VL.end() && "Expected at least one non-undef value.");
    // Add broadcast for the non-identity shuffle only.
    bool NeedShuffle =
        count(VL, *It) > 1 &&
        (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
    if (!NeedShuffle) {
      if (isa<FixedVectorType>(ScalarTy)) {
        // ...
        return TTI.getShuffleCost(
            TTI::SK_InsertSubvector, VecTy, {}, CostKind,
            std::distance(VL.begin(), It) * getNumElements(ScalarTy),
            cast<FixedVectorType>(ScalarTy));
      }
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
                                    PoisonValue::get(VecTy), *It);
    }
    SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
    transform(VL, ShuffleMask.begin(), [](Value *V) {
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    });
    InstructionCost InsertCost =
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                               PoisonValue::get(VecTy), *It);
    return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy,
                                         ShuffleMask, CostKind);
    // ...
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }

  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // Check if the same elements are inserted several times and count them as
    // shuffle candidates.
    auto CheckPerRegistersShuffle =
        [&](MutableArrayRef<int> Mask, SmallVectorImpl<unsigned> &Indices,
            SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check that if trying to permute the same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallSet<int, 4> RegIndices;
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of
    // creating a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      // ...
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ...
      }
      // Second attempt to check if just a permute is better estimated than
      // a subvector extract.
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
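  // For example, with EltsPerVector == 4 a sub-mask such as <0, 1, 8, 9>
  // touches two source registers (elements 0-1 of register 0 and the first
  // two elements of a later register), so the per-part estimate may use an
  // SK_PermuteTwoSrc shuffle over just those two registers rather than one
  // shuffle over the whole source vector.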
  /// Estimates the permutation cost for two nodes being shuffled together.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffled: include
      // this sub-mask into the common mask and estimate it later, to avoid
      // double cost estimation.
      if ((InVectors.size() == 2 /* ... */)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // ones and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }

  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) /* ... */;
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  /// Computes the cost of shuffling two inputs (values or tree entries) by
  /// the given mask, normalizing both inputs to a common vector factor.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(),
                                     It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      // ...
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode,
            getWidenedType(ScalarTy,
                           VecTy->getElementCount().getKnownMinValue()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle a single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      // ...
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not identity/broadcast? Try to see if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle a single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle a vector and a tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle a tree node and a vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}

  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if the same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        // If all users of an instruction will be vectorized while the
        // instruction itself is not, consider this extractelement as dead and
        // remove its cost from the final cost of the vectorized tree.
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(), [&](User *U) {
              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),
                                              &VectorizedVals);
            }))
          continue;
        // ...
        unsigned Idx = *EEIdx;
        // Take credit for an instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          // ...
          Cost -= TTI.getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
              Idx);
          // Add back the cost of the s|zext which is subtracted separately.
          Cost += TTI.getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), EE->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          continue;
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
                             APInt::getZero(getNumElements(VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(
          cast<FixedVectorType>(Vec->getType()), DemandedElts,
          /*Insert=*/false, /*Extract=*/true, CostKind);
    // Check that a gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry needs to be delayed because of its
  /// dependency nodes. No delay is required during cost estimation.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    return std::nullopt;
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    // ...
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    assert(all_of(Mask,
                  [&](int Idx) {
                    return Idx < static_cast<int>(E1.getVectorFactor());
                  }) &&
           "Expected single vector shuffle mask.");
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds two input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    Value *Scalar =
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index());
                    auto *EI = dyn_cast<ExtractElementInst>(Scalar);
                    return EI && (EI->getVectorOperand() == V1 ||
                                  EI->getVectorOperand() == V2);
                  }) &&
           "Expected extractelement vectors.");
    // ...
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar =
                          cast<const TreeEntry *>(InVectors.front())
                              ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, cast<FixedVectorType>(
                            cast<Value *>(InVectors.front())->getType())
                            ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  void gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
              Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      for (Value *V : VL.take_front(VF)) {
        // ...
        Vals.push_back(getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
      }
      // ...
    }
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        // ...
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::None, CostKind);
        }
        // ...
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;
  // ...
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    // ...
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  // ...
  if (E->isGather()) {
    // ...
    ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      // ...
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0,
                             "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() &&
      E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  // ...
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                        : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some instructions there is no need to calculate the cost for
          // each particular instruction: use the cost of a single instruction
          // times the total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized, if the parent node is
        // not resized.
        if (It != MinBWs.end() &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy =
                  getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               CCH, CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference from vectorizing a set of GEPs.
  // Negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
          CanonicalType->getContext(),
          DL->getTypeSizeInBits(CanonicalType->getScalarType())));
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead
    // and we can adjust the cost by removing their cost.
    if (VI && SelectOnly) {
      assert((!Ty->isVectorTy() || SLPReVec) &&
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
        if (const TreeEntry *OpTE = /* ... */ nullptr)
          if (CountedOps.insert(OpTE).second &&
              !OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic *
                          (OpTE->ReuseShuffleIndices.size() -
                           OpTE->Scalars.size());
      }
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        // ...
        // Use getExtractWithExtendCost() to calculate the cost of
        // extractelement/ext pair, and subtract the cost of the s|zext which
        // is subtracted separately.
        Cost -= TTI->getCastInstrCost(
            Ext->getOpcode(), Ext->getType(), I->getType(),
            TTI::getCastContextHint(Ext), CostKind, Ext);
        return Cost;
      }
      // ...
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      if (DemandedElts.isZero())
        return CommonCost;
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI->getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    // ...
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz -
                      OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
            VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-sized vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate a correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    // ...
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, InsertVecTy,
                               Mask);
    if (any_of(VL, [&](Value *V) {
          return !is_contained(E->Scalars,
                               cast<Instruction>(V)->getOperand(0));
        })) {
      // ...
    }
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      } else {
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        // ...
      }
    }
    // ...
  }
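  // For example, inserting four scalars at positions 2..5 of an 8-wide
  // destination gives OffsetBeg == 2 and OffsetEnd == 5; the whole region
  // fits within one vector register, so the estimate models an aligned
  // subvector insert plus, when the region is not a whole subvector, a blend
  // of the original vector with the newly built subvector.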
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind,
                                   VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost here if minimum bitwidth is in effect and the
      // cast is just a bitcast (a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
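  // For example, if MinBWs records that both this node and its source were
  // demoted to i16 while the IR contains zext i8 -> i32, then
  // BWSz == SrcBWSz == 16 turns VecOpcode into Instruction::BitCast and the
  // vector cast becomes free above; a demotion on only one side instead
  // yields a Trunc or an S/ZExt between the narrowed vector types.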
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpPredicate CurrentPred = VecPred;
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) /* ... */)
        CurrentPred = SwappedVecPred;
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      FMF.set();
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      Value *Op1 = E->getOperand(0)[Idx];
      SmallVector<const Value *, 2> Operands(1, Op1);
      Value *Op2 = nullptr;
      if (E->getNumOperands() > 1) {
        Op2 = E->getOperand(1)[Idx];
        Operands.push_back(Op2);
      }
      // Try to account for the possibility of folding (a * b) +/- c into an
      // fmuladd on the scalar side as well.
      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // ...
      }
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        // An 'and' with a constant whose low It->second.first bits are all
        // ones is a noop after demotion.
        if (all_of(E->getOperand(1), [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            }))
          return CommonCost;
      }
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
                                         Op1Info, Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
    case Instruction::GetElementPtr: {
      return CommonCost + GetGEPCostDiff(VL, VL0);
    }
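    // Loads: the vector-side cost depends on how the node is lowered --
    // consecutive (possibly interleaved), strided, compressed (masked load
    // plus decompressing shuffle), or gathered via scatter/gather addressing.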
    case Instruction::Load: {
      auto GetScalarCost = [&](unsigned Idx) {
        // ...
        return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                    VI->getAlign(),
                                    VI->getPointerAddressSpace(),
                                    /* ... */);
      };
      auto GetVectorCost = [&](InstructionCost CommonCost) {
        InstructionCost VecLdCost;
        switch (E->State) {
        case TreeEntry::Vectorize:
          if (unsigned Factor = E->getInterleaveFactor()) {
            VecLdCost = TTI->getInterleavedMemoryOpCost(
                Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
                LI0->getPointerAddressSpace(), CostKind);
          } else {
            VecLdCost = TTI->getMemoryOpCost(
                Instruction::Load, VecTy, LI0->getAlign(),
                /* ... */);
          }
          break;
        case TreeEntry::StridedVectorize: {
          Align CommonAlignment = /* ... */;
          VecLdCost = TTI->getStridedMemoryOpCost(
              Instruction::Load, VecTy, LI0->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind);
          break;
        }
        case TreeEntry::CompressVectorize: {
          // ...
          unsigned InterleaveFactor;
          SmallVector<int> CompressMask;
          // ...
          if (!E->ReorderIndices.empty()) {
            SmallVector<int> Mask(E->ReorderIndices.begin(),
                                  E->ReorderIndices.end());
            // ...
          }
          // ...
              Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC,
              *DT, *TLI, [](Value *) { return true; }, IsMasked,
              InterleaveFactor, CompressMask, LoadVecTy);
          assert(IsVectorized && "Failed to vectorize load");
          CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                          InterleaveFactor, IsMasked);
          Align CommonAlignment = LI0->getAlign();
          if (InterleaveFactor) {
            VecLdCost = TTI->getInterleavedMemoryOpCost(
                Instruction::Load, LoadVecTy, InterleaveFactor, {},
                CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
          } else if (IsMasked) {
            VecLdCost = TTI->getMaskedMemoryOpCost(
                Instruction::Load, LoadVecTy, CommonAlignment,
                LI0->getPointerAddressSpace(), CostKind);
            // ... plus the decompressing shuffle:
            //     LoadVecTy, CompressMask, CostKind);
          } else {
            VecLdCost = TTI->getMemoryOpCost(
                Instruction::Load, LoadVecTy, CommonAlignment,
                /* ... */);
            // ... plus the decompressing shuffle:
            //     LoadVecTy, CompressMask, CostKind);
          }
          break;
        }
        case TreeEntry::ScatterVectorize: {
          Align CommonAlignment = /* ... */;
          VecLdCost = TTI->getGatherScatterOpCost(
              Instruction::Load, VecTy, LI0->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind);
          break;
        }
        case TreeEntry::CombinedVectorize:
        case TreeEntry::SplitVectorize:
        case TreeEntry::NeedToGather:
          // ...
        }
        return VecLdCost + CommonCost;
      };
      // ...
      if (E->State == TreeEntry::ScatterVectorize)
        // ...
      return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
    }
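    // Stores: strided stores use a strided memory op; consecutive stores use
    // an interleaved or plain vector store, honoring any reordering.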
    case Instruction::Store: {
      bool IsReorder = !E->ReorderIndices.empty();
      auto GetScalarCost = [=](unsigned Idx) {
        // ...
        return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                    VI->getAlign(),
                                    VI->getPointerAddressSpace(),
                                    /* ... */);
      };
      auto GetVectorCost = [=](InstructionCost CommonCost) {
        // ...
        InstructionCost VecStCost;
        if (E->State == TreeEntry::StridedVectorize) {
          Align CommonAlignment = /* ... */;
          VecStCost = TTI->getStridedMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind);
        } else {
          assert(E->State == TreeEntry::Vectorize &&
                 "Expected either strided or consecutive stores.");
          if (unsigned Factor = E->getInterleaveFactor()) {
            assert(E->ReuseShuffleIndices.empty() &&
                   !E->ReorderIndices.empty() &&
                   "No reused shuffles expected");
            // ...
            VecStCost = TTI->getInterleavedMemoryOpCost(
                Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
                BaseSI->getPointerAddressSpace(), CostKind);
          } else {
            VecStCost = TTI->getMemoryOpCost(
                Instruction::Store, VecTy, BaseSI->getAlign(),
                BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
          }
        }
        return VecStCost + CommonCost;
      };
      // ...
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      // ...
      return GetCostDiff(GetScalarCost, GetVectorCost) +
             GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
    }
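    // Calls: compare scalar intrinsic calls against the cheaper of a vector
    // intrinsic and a vectorized library call.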
    case Instruction::Call: {
      auto GetScalarCost = [&](unsigned Idx) {
        // ...
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      };
      auto GetVectorCost = [=](InstructionCost CommonCost) {
        // ...
            CI, ID, VecTy->getNumElements(),
            It != MinBWs.end() ? It->second.first : 0, TTI);
        return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
      };
      return GetCostDiff(GetScalarCost, GetVectorCost);
    }
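    // Alternate-opcode shuffles: cost both vector opcodes plus the blending
    // shuffle, preferring a single alt-instruction lowering when the target
    // supports it.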
    case Instruction::ShuffleVector: {
      assert(/* ... */ && "Invalid Shuffle Vector Operand");
      // ...
      auto TryFindNodeWithEqualOperands = [=]() {
        for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
          // ...
          if (TE->hasState() && TE->isAltShuffle() &&
              ((TE->getOpcode() == E->getOpcode() &&
                TE->getAltOpcode() == E->getAltOpcode()) ||
               (TE->getOpcode() == E->getAltOpcode() &&
                TE->getAltOpcode() == E->getOpcode())) &&
              TE->hasEqualOperands(*E))
            return true;
        }
        return false;
      };
      auto GetScalarCost = [&](unsigned Idx) {
        // ...
        assert(E->getMatchingMainOpOrAltOp(VI) &&
               "Unexpected main/alternate opcode");
        // ...
        return TTI->getInstructionCost(VI, CostKind);
      };
      auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
        // ...
        if (TryFindNodeWithEqualOperands()) {
          LLVM_DEBUG({
            dbgs() << "SLP: diamond match for alternate node found.\n";
            // ...
          });
          // ...
        }
        // ...
        InstructionCost VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
        // ...
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /* ... */);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, /* ... */,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /* ... */);
        // ...
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                        /* ... */);
          LLVM_DEBUG(
              dbgs()
              << "SLP: alternate extension, which should be truncated.\n");
          // ...
        }
        // ...
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          /* ... */);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    /* ... */);
        // ...
        SmallVector<int> Mask;
        E->buildAltOpShuffleMask(
            [&](Instruction *I) {
              assert(E->getMatchingMainOpOrAltOp(I) &&
                     "Unexpected main/alternate opcode");
              // ...
            },
            Mask);
        // ...
        unsigned Opcode0 = E->getOpcode();
        unsigned Opcode1 = E->getAltOpcode();
        SmallBitVector OpcodeMask(/* ... */);
        // ...
        if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
          InstructionCost AltVecCost = TTIRef.getAltInstrCost(
              VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
          return AltVecCost < VecCost ? AltVecCost : VecCost;
        }
        // ...
      };
      if (/* ... */)
        return GetCostDiff(
            GetScalarCost, [&](InstructionCost) -> InstructionCost {
              // ...
              assert(/* ... */ && "Not supported shufflevector usage.");
              // ...
              unsigned SVNumElements =
                  cast<FixedVectorType>(/* ... */)->getNumElements();
              unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
              for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
                // ...
                assert(/* ... */ && "Not supported shufflevector usage.");
                // ...
                [[maybe_unused]] bool IsExtractSubvectorMask =
                    SV->isExtractSubvectorMask(Index);
                assert(IsExtractSubvectorMask &&
                       "Not supported shufflevector usage.");
                if (NextIndex != Index)
                  // ...
                NextIndex += SV->getShuffleMask().size();
              }
              // ...
              return ::getShuffleCost(/* ... */);
            });
      return GetCostDiff(GetScalarCost, GetVectorCost);
    }
    case Instruction::Freeze:
      return CommonCost;
    // ...
    }
  // ...
}
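// Decides whether a tiny tree (height 1-2) is profitable on its own: either
// a single vectorizable root that is wide enough, or a vectorizable root
// whose companion node is a cheap gather of extracts/loads.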
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size()
                    << " is fully vectorizable.\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE,
                                         unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (/* ... */
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              // ...
              (TE->hasState() && TE->getOpcode() == Instruction::Load &&
               !TE->isAltShuffle()) ||
              /* ... */)));
  };

  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       // ...
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // ...
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // ...
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
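// Load-combine matching: walk up through or/shl chains (shift amounts a
// multiple of 8) to a zext'ed load; such chains are better served by load
// combining in the backend than by SLP vectorization.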
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // ...
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (/* matches 'or', or 'shl' with */ ShAmtC->urem(8) == 0 /* ... */) {
    // ...
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check that the whole chain terminates in an extended load.
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      /* ... */)
    return false;
  // ...
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    /* ... */);
  return true;
}

// ...
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
// ...
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    // ...
  }
// ...
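// Heuristics for whether the whole tree is too small or too gather-heavy to
// profit from vectorization; each early-out below names one unprofitable
// shape (empty tree, splat-fed gather, gather-dominated PHI trees,
// store/load-poor large trees, etc.).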
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }

  // ...
  if (VectorizableTree.size() == 2 &&
      // ...
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /* ... */)))
    return true;

  // ...
  constexpr int Limit = 4;
  if (// ...
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                /* ... */) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // ...
  if (VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       /* ... */) ||
                      (TE->getOpcode() == Instruction::InsertElement ||
                       (TE->getOpcode() == Instruction::PHI &&
                        all_of(TE->Scalars, [&](Value *V) {
                          return isa<PoisonValue>(V) ||
                                 MustGather.contains(V);
                        })));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // ...
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (// ...
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 // ...
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      // ...
                      (TE->getOpcode() == Instruction::PHI ||
                       (TE->hasCopyableElements() &&
                        /* ... */
                        TE->Scalars.size() / 2) ||
                       ((!TE->ReuseShuffleIndices.empty() ||
                         !TE->ReorderIndices.empty() ||
                         TE->isAltShuffle()) &&
                        TE->Scalars.size() == 2));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 ||
         none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;

  // ...
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      // ...
      [&](const std::unique_ptr<TreeEntry> &TE) {
        return !TE->isGather() && TE->UserTreeIndex.UserTE &&
               TE->UserTreeIndex.UserTE->Idx == 0;
      }))
    return true;

  // ...
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;

  // ...
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // ...
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /* ... */);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      // ...
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          /* ... */))
    return true;

  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      // ...
      [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               /* ... */;
      }))
    return false;

  // ...
  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      continue;
    // ...
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        /* ... */)
      return false;
  }
  // ...
}
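// Live-range/spill analysis: walk the tree bottom-up and charge the cost of
// keeping vector values live across instructions that lower to real calls
// (non-intrinsic calls, or intrinsics that still expand to calls).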
InstructionCost BoUpSLP::getSpillCost() {
  // ...
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return Cost;
  // ...
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }

  // Intrinsics that are cheaper than the equivalent call do not count as
  // spill points.
  auto NoCallIntrinsic = [&](const Instruction *I) {
    // ...
    if (II->isAssumeLikeIntrinsic())
      return true;
    // ...
    return IntrCost < CallCost;
  };

  // ...
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit = /* ... */;
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      // ...
      return It->second.getInt() != 0;
    }
    // ...
    BasicBlock::const_reverse_iterator InstIt =
                                           ++First->getIterator().getReverse(),
                                       PrevInstIt =
                                           Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // ...
      for (const Instruction *LastInst : LastInstsInRange)
        CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
      // ...
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      // ...
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };

  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      // ...
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    // ...
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };

  // ...
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    // ...
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(!ParentOpParentToPreds.contains(KeyPair) &&
             "Should not have been added before.");
      // ...
    }
    // ...
    while (!Worklist.empty()) {
      // ...
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        // ...
      }
      ParentsPairsToAdd.insert(Pair);
      // ...
      if (Budget > BudgetLimit)
        // ...
    }
    // ...
  };

  while (!LiveEntries.empty()) {
    // ...
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    // ...
    if (!Op->isGather())
      // ...
    if (Entry->State == TreeEntry::SplitVectorize ||
        (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
        /* ... */)
      Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
    // ...
    if (Op->isGather()) {
      assert(Entry->getOpcode() == Instruction::PHI &&
             "Expected phi node only.");
      OpParent = /* ... */
          ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      // ...
      for (Value *V : Op->Scalars) {
        // ...
      }
      // ...
    } else {
      OpLastInst = EntriesToLastInstruction.at(Op);
      // ...
    }
    // ...
    if (OpParent == Parent) {
      if (Entry->getOpcode() == Instruction::PHI) {
        if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
          // ...
        continue;
      }
      if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
        // ...
      continue;
    }
    // ...
    if (Entry->getOpcode() != Instruction::PHI &&
        !CheckForNonVecCallsInSameBlock(
            &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
            /* ... */))
      // ...
    if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                        /* ... */))
      // ...
    if (!CheckPredecessors(Parent, Pred, OpParent)) {
      // ...
    }
  }

  return Cost;
}
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        /* ... */)
      // ...
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        /* ... */)
      // ...
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
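// Helper machinery for costing/building the final shuffles that feed external
// insertelement users: ValueSelect abstracts over Value* vs. cost results,
// and performExtractsShuffleAction folds a chain of per-vector masks into
// successive two-source shuffle actions.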
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};

// ...
template <typename T>
static T *performExtractsShuffleAction(/* ShuffleMask, Base, GetVF,
                                          ResizeAction, Action */) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  // ...
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector IsBaseUndef =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef: combine it with the first subvector.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, false);
    // ...
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      // ...
      Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // ...
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /* ... */);
    // ...
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // ...
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // ...
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first,
                                               Mask, false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, false);
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested action for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // ...
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, false);
    // ...
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      // ...
      assert(/* ... */ && "Multiple uses of scalars.");
      Mask[I] = (Res.second ? I : SecMask[I]) + VF;
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
template <typename T> struct ShuffledInsertData {
  // ...
  MapVector<T, SmallVector<int>> ValueMasks;
};
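// getTreeCost: sums per-node costs, charges extract costs for external users
// (or keeps profitable scalars), folds insertelement shuffles, and accounts
// for reduction bitwidth resizing and spills.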
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals
                                     /* ... */) {
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // ...
  InstructionCost Cost = 0;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // ...
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG({
        dbgs() << "SLP: Skipping cost for combined node that starts with "
               << *TE.Scalars[0] << ".\n";
        TE.dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      continue;
    }
    // ...
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // ...
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
    // ...
    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
  }
  // ...
  if (none_of(ExternalUses, [](const ExternalUser &EU) {
        /* ... */
      }))
    // ...
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  for (ExternalUser &EU : ExternalUses)
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  // ...
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) /* ... */;
               else dbgs() << "  User: nullptr\n");
    LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand() << "\n");

    // Uses by ephemeral values are free.
    if (EphValues.count(EU.User))
      continue;

    // Skip duplicate scalar/user pairs.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;
    // ...
    if (/* ... */
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         /* ... */))
      continue;
    // ...
    if (/* ... */ !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // ...
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
      if (!UsedInserts.insert(VU).second)
        continue;
      // ...
      const TreeEntry *ScalarTE = &EU.E;
      // ...
      auto *It = find_if(
          ShuffledInserts,
          [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
            /* ... */
          });
      // ...
      Value *Op0 = II->getOperand(0);
      // ...
      if (It == ShuffledInserts.end()) {
        auto &Data = ShuffledInserts.emplace_back();
        Data.InsertElements.emplace_back(VU);
        // ...
        VecId = ShuffledInserts.size() - 1;
        auto It = MinBWs.find(ScalarTE);
        if (It != MinBWs.end() &&
            VectorCasts
                .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                .second) {
          unsigned BWSz = It->second.first;
          unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
          unsigned VecOpcode;
          if (DstBWSz < BWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          // ...
              FTy->getNumElements()),
          // ...
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for extending externally used vector with "
                               "non-equal minimum bitwidth.\n");
          Cost += C;
        }
      } else {
        // ...
        It->InsertElements.front() = VU;
        VecId = std::distance(ShuffledInserts.begin(), It);
      }
      int InIdx = *InsertIdx;
      SmallVectorImpl<int> &Mask =
          ShuffledInserts[VecId].ValueMasks[ScalarTE];
      // ...
      Mask[InIdx] = EU.Lane;
      DemandedElts[VecId].setBit(InIdx);
      continue;
    }
    // ...
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    // ...
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      // ...
      unsigned Extend = /* ... */ ? Instruction::ZExt
                                  : Instruction::SExt;
      // ...
      LLVM_DEBUG(dbgs() << /* ... */
                        << ExtraCost << "\n");
    } else {
      ExtraCost =
          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                  CostKind, EU.Lane, EU.Scalar,
                                  ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << "  ExtractElement cost for " << *ScalarTy
                        << " from " << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar in place when keeping the whole chain scalar is
    // cheaper than extracting from the vector.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // ...
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        // ...
        const Loop *L = LI->getLoopFor(Phi->getParent());
        return L && (Phi->getParent() == I->getParent() ||
                     L == LI->getLoopFor(I->getParent()));
        // ...
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // ...
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // ...
      auto OperandIsScalar = [&](Value *V) {
        // ...
        return !EE->hasOneUse() || !MustGather.contains(EE);
        // ...
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *Op = /* ... */;
          Op && all_of(Op->operands(), OperandIsScalar)) {
        // ...
        if (ScalarCost + OpCost <= ExtraCost) {
          CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
          ScalarCost += OpCost;
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // ...
        bool IsProfitablePHIUser =
            (/* ... */
             VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            // ...
            any_of(/* ... */, [&](User *U) {
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          /* ... */
                          VectorizableTree.front()->getMainOp()
                          /* ... */);
            }) &&
            // ...
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            });
        if (IsProfitablePHIUser) {
          // ...
        } else if (/* ... */
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // ...
          KeepScalar =
              ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // ...
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // ...
            for (Value *V : IOp->operands()) {
              auto It = ValueToExtUses->find(V);
              if (It != ValueToExtUses->end()) {
                // ...
                ExternalUses[It->second].User = nullptr;
              }
            }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // Register scalar cast operands as original scalars, too.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    // ...
    ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                              TEs.front()->findLaneForValue(V));
  }
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      /* ... */);
      }
    }
  }

  Cost += ExtractCost;
  auto ResizeToVF = [&](const TreeEntry *TE, ArrayRef<int> Mask,
                        bool ForSingleMask) {
    // ...
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        /* ... */) {
      if (HasLargeIndex) {
        // ...
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  /* ... */);
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        // ...
        return std::make_pair(TE, true);
      }
      // ...
      if (!ForSingleMask) {
        // ...
        for (unsigned I = 0; I < VF; ++I) {
          // ...
          ResizeMask[Mask[I]] = Mask[I];
        }
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
      }
      // ...
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        // ...
        unsigned VF = TEs.front()->getVectorFactor();
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!all_of(enumerate(Mask), [=](const auto &Data) {
              return /* ... */
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          // ...
          LLVM_DEBUG({
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement "
                      "external users.\n";
            TEs.front()->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n";
          });
          // ...
        }
      } else {
        // ...
        if (/* ... */
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of vector node and external "
                    "insertelement users.\n";
          if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        // ...
      }
      // ...
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        /* ... */
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        /* ... */);
    Cost -= InsertCost;
  }
  // If the reduction root is resized by minimum-bitwidth analysis, charge the
  // cast from the computed width back to the reduction width.
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            /* ... */
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   /* ... */});
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        // ...
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        // ...
        }
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          /* ... */);
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << CastCost
                 << " for final resize for reduction from " << SrcVecTy
                 << " to " << DstVecTy << "\n";
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
      }
    }
  }
  std::optional<InstructionCost> SpillCost;
  // ...
  Cost += *SpillCost;
  // ...
  OS << "SLP: Spill Cost = ";
  // ...
  OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
     << "SLP: Total Cost = " << Cost << ".\n";
  // ...
  ViewGraph(this, "SLP" + F->getName(), false, Str);
  // ...
}
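// Gather-of-extracts matching: if a gather's scalars are extractelements
// from a small number of source vectors, model the gather as a shuffle of
// those vectors instead of building the vector from scratch.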
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if (Idx >= VecTy->getNumElements()) {
      // ...
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  // ...
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // ...
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undef vectors.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as a shuffle
  // of one or two source vectors.
  std::optional<TTI::ShuffleKind> Res =
      /* ... */;
  if (!Res) {
    // ...
    return std::nullopt;
  }
  // ...
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // ...
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
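// isGatherShuffledSingleRegisterEntry: for one register-sized slice of a
// gather node, look for earlier tree entries whose scalars can supply the
// values via one or two shuffles, checking def-use ordering and dominance so
// the shuffle source is materialized before the gather's insertion point.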
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  // ...
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  // ...
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  // ...
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // ...
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // ...
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      // ...
    if (TEInsertPt->getParent() == InsertBlock &&
        /* ... */)
      // ...
  };
  // ...
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // ...
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    for (Value *V : VL) {
      // ...
    }
    // ...
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (/* ... */) {
      // ...
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (/* ... */) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    // ...
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
      // ...
      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? /* ... */
                             : nullptr;
      Instruction *InsertPt =
          UserPHI ? /* ... */
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // ...
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            /* ... */)
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              /* ... */)
            continue;
        // ...
        if (/* ... */
            (TEUseEI.UserTE != UseEI.UserTE ||
             TEUseEI.EdgeIdx < UseEI.EdgeIdx))
          continue;
        // ...
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
            UseEI.UserTE->State == TreeEntry::Vectorize &&
            UseEI.UserTE->getOpcode() == Instruction::PHI &&
            TEUseEI.UserTE != UseEI.UserTE)
          continue;
        // ...
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // ...
        if (TEUseEI.UserTE != UseEI.UserTE &&
            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
             HasGatherUser(TEUseEI.UserTE)))
          continue;
        // ...
        if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
          continue;
      }
      // ...
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          /* ... */)
        continue;
      // ...
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx ||
           TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // ...
      if (CheckAndUseSameNode(TEPtr))
        // ...
    }
    // ...
    const auto *It = find_if(
        VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
    if (It != VTEs.end()) {
      const TreeEntry *VTE = *It;
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      if (CheckAndUseSameNode(VTE))
        // ...
    }
    // ...
    const TreeEntry *VTE = VTEs.front();
    if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
        VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
      VTEs = VTEs.drop_front();
      // ...
      const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
        return MTE->State == TreeEntry::Vectorize;
      });
      if (MIt == VTEs.end())
        continue;
      // ...
    }
    if (none_of(TE->CombinedEntriesWithIndices,
                [&](const auto &P) { return P.first == VTE->Idx; })) {
      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
        continue;
    }
    if (CheckAndUseSameNode(VTE))
      // ...
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // First iteration: just record the list of candidate nodes.
      // ...
    } else {
      // ...
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // ...
        if (!VToTEs.empty()) {
          // ...
          break;
        }
        VToTEs = SavedVToTEs;
      }
      // ...
      if (Idx == UsedTEs.size()) {
        // More than two input vectors would not be a permutation; fall back
        // to a regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      // ...
    }
  }

  if (UsedTEs.empty()) {
    // ...
    return std::nullopt;
  }
  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find a perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      // ...
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        // ...
      }
      // ...
    }
    // ...
    Entries.push_back(FirstEntries.front());
    // ...
    for (auto &P : UsedValuesEntry)
      // ...
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // ...
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // ...
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        // ...
      }
      // ...
    }
    // ...
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        // ...
      }
    }
    // ...
    if (Entries.empty()) {
      Entries.push_back(*min_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  // ...
  for (const TreeEntry *E : Entries)
    // ...
  for (auto &P : UsedValuesEntry) {
    // ...
    if (ValuesToEntries[Idx].contains(P.first)) {
      // ...
    }
  }
  // ...
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    // ...
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      // ...
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
    }
    return true;
  };
  // ...
  auto MightBeIgnored = [=](Value *V) {
    // ...
    return /* ... */ !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // ...
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           /* ... */;
  };
  // ...
  SmallBitVector UsedIdxs(Entries.size());
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // ...
    if (/* ... */ &&
        ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
         (I != E - 1 && NeighborMightBeIgnored(V, I + 1))))
      continue;
    unsigned Idx = It->second;
    // ...
  }
  // ...
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // ...
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    // ...
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // ...
    return std::nullopt;
  }
  bool IsIdentity = Entries.size() == 1;
  // ...
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    // ...
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        // ...
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        // ...
    }
  } else if (/* ... */ &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Estimate whether the shuffle beats a buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      // ...
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        /* ... */ ((MaxElement % VF) -
                   (MinElement % VF) + 1));
    // ...
    for (int &Idx : SubMask) {
      // ...
      Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
            (Idx >= static_cast<int>(VF) ? NewVF : 0);
    }
    // ...
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        /* ... */) {
      // ...
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          /* ... */(
              Mask, Entries.front()->getInterleaveFactor()))
        // ...
      return ::getShuffleCost(TTI,
                              /* ... */);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    // ...
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // ...
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          // ...
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      // ...
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += /* ... */(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /* ... */);
    }
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // ...
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          // ...
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      // ...
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += /* ... */(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /* ... */);
    }
    // ...
    InstructionCost BuildVectorCost = /* ... */(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
        /*Insert=*/true, /* ... */);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      // ...
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      // ...
      Entries.push_back(BestEntry);
      // ...
    }
  }
  // ...
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            /* ... */);
  return std::nullopt;
}
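// isGatherShuffledEntry: split the gather node into register-sized parts and
// run the single-register matching per part, falling back to a full-vector
// match when one entry covers all scalars.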
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // ...
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(/* ... */,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // ...
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  // ...
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (/* ... */
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       // ...
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    // ...
    if (/* ... */
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      // ...
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        // ...
      Entries.emplace_back(1, LocalSubEntries.front());
      // ...
    }
  }
  if (all_of(/* ... */,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    // ...
  }
  // ...
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  // ...
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    // ...
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
                                    V->getType(),
                                    /* ... */);
    // ...
  };
  // ...
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  // ...
  ConstantShuffleMask[I] = I + VF;
  // ...
  EstimateInsertCost(I, V);
  // ...
  bool IsAnyNonUndefConst = /* ... */;
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    // ... shuffle of the constant vector into the source, using
    //     ConstantShuffleMask
  }
  // ...
  if (!DemandedElements.isZero())
    // ... scalarization overhead for the demanded lanes, with
    //     ForPoisonSrc && !IsAnyNonUndefConst, VL
  // ...
}
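// getLastInstructionInBundle: picks the insertion point for a tree entry --
// normally the last member of the scheduled bundle, with special handling for
// gathered loads, split nodes, PHIs, and copyable elements.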
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;
  // ...
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  }
  // ...
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize ||
          E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    // ...
    for (Value *V : E->Scalars) {
      // ...
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        // ...
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               /* ... */) ||
              E->State == TreeEntry::SplitVectorize ||
              // ...
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        // ...
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };
  auto FindFirstInst = [&]() {
    // ...
    for (Value *V : E->Scalars) {
      // ...
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               /* ... */) ||
              /* ... */) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        // ...
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    // ...
    for (auto *E : Entries) {
      // ...
      I = &getLastInstructionInBundle(E);
      // ...
    }
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // ...
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    // ...
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      // ...
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         /* ... */
           return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
         /* ... */) ||
        /* ... */
          return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
                 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
        /* ... */)
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }
  // ...
  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }
  // ...
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
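// setInsertPointAfterBundle: positions the IR builder right after the
// bundle's last instruction (or at the block's first non-PHI slot for PHIs,
// gathered loads, and unscheduled nodes) before emitting vector code.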
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // ...
  if (/* ... */)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (LastInstIt != LastInst->getParent()->end() &&
      LastInstIt->getParent()->isLandingPad())
    LastInstIt = std::next(LastInstIt);
  if (/* ... */ ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       E->doesNotNeedToSchedule()) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    Builder.SetInsertPoint(
        /* ... */);
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
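// gather(): materializes a gathered bundle as insertelement chains,
// postponing values defined inside loops (or past the insert point) so they
// are inserted near their definition, and recording external uses of the
// extracted scalars.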
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // ...
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
         // ...
         (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
        PostponedIndices.insert(I).second)
      // ...
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    // ...
    if (Scalar->getType() != Ty) {
      // ...
      Scalar = Builder.CreateIntCast(
          /* ... */);
      // ...
    }
    // ...
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    // ...
    GatherShuffleExtractSeq.insert(InsElt);
    // ...
    User *UserOp = nullptr;
    // ...
    if (V->getType()->isVectorTy()) {
      if (/* ... */
          SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
        // Find the shufflevector introduced by a resize.
        auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
          // ...
          if (SV->getOperand(0) == V)
            // ...
          if (SV->getOperand(1) == V)
            // ...
        };
        if (Instruction *User = FindOperand(SV->getOperand(0), V))
          UserOp = User;
        else if (Instruction *User = FindOperand(SV->getOperand(1), V))
          UserOp = User;
        assert(UserOp && "Failed to find shufflevector, caused by resize.");
        // ...
      }
    }
    // ...
    unsigned FoundLane = Entries.front()->findLaneForValue(V);
    ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
    // ...
  };
  // ...
  SmallVector<int> NonConsts;
  // ...
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (/* ... */
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    // ...
  }
  if (/* ... */) {
    Vec = OriginalRoot;
  } else {
    Vec = CreateShuffle(Root, Vec, Mask);
    if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
        OI && OI->use_empty() &&
        none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->VectorizedValue == OI;
        }))
      // ...
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append postponed (possibly loop-resident) values last, so non-loop
  // instructions can still be hoisted.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  // ...
}
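// ShuffleInstructionBuilder helpers: ShuffleIRBuilder emits (and CSE-tracks)
// the actual shufflevector/cast instructions, resizing mismatched operands
// to a common VF before combining them.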
/// Merges shuffle masks and emits the final shuffle instructions, if
/// required. (Class head elided in this fragment.)
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  // ... (members elided)
  bool IsFinalized = false;

  class ShuffleIRBuilder {
    // ... (members elided: Builder, GatherShuffleExtractSeq, CSEBlocks, DL)
  public:
    ShuffleIRBuilder(/* ... */)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      assert(/* ... */ && "Expected integer vector types only.");
      if (V1->getType() != V2->getType()) {
        // Cast the narrower operand to the common element type.
        if (/* ... */ cast<VectorType>(V2->getType())
                ->getElementType()
                ->getIntegerBitWidth())
          V2 = Builder.CreateIntCast(/* ... */);
        else
          V1 = Builder.CreateIntCast(/* ... */);
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates permutation of the single vector operand with the given mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      // ...
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      /* ... */
    }
    /// Resizes 2 input vectors to match their sizes: the smaller vector is
    /// extended with poison elements via an identity-prefix shuffle.
    void resizeToMatch(Value *&V1, Value *&V2) {
      // ...
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission: walks through the shuffle tree and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }
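  // Editorial note (hedged): BaseShuffleAnalysis::createShuffle is the
  // mask-folding driver; ShuffleIRBuilder only supplies the IR callbacks.
  // Folding a shuffle-of-shuffle therefore costs no extra instruction:
  //
  //   ; %s = shufflevector %a, poison, <1, 0, 3, 2>
  //   ; requesting mask <1, 0, 3, 2> of %s folds to the identity of %a,
  //   ; so createIdentity(%a) is returned and no shuffle is emitted.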
  /// Cast value \p V to the vector type with the same number of elements but
  /// the base scalar element type.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    // ...
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(/* ... */));
  }

  /// Returns the vectorized value for \p E, cast to the base scalar element
  /// type if needed.
  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ... (poison lanes skipped)
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only use is vectorized - the extractelement itself can be
      // erased.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                         UTEs.front() /* ... */;
                            }) /* ... */;
          }) ||
          /* ... */ is_contained(VL, EI))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register vector shuffle of all parts.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    // ... (per-part slicing elided)
    for (unsigned Part : seq<unsigned>(NumParts)) {
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        // ... (poison lanes skipped, VecOp recomputed as above)
        VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert(/* ... all previous parts masked, e.g.: */
               all_of(SubMask,
                      [](int Idx) { return Idx == PoisonMaskElem; }) ||
               true && "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        // ...
        unsigned SubVecVF =
            cast<FixedVectorType>(SubVec->getType())->getNumElements();
        NewVF = std::max(NewVF, SubVecVF);
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += /* ... */;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    // ...
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the process
    // to keep correct order. Return a stub load as the placeholder value.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy, /* ... */, MaybeAlign());
  }
  /// Reset the builder to handle a perfect-diamond match.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    // ...
  }
  /// Adds 2 input vectors (in the form of tree entries) and the mask for
  /// their shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in the form of a tree entry) and the mask
  /// for its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (/* ... */) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (/* ... */ CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (/* ... */)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? /* ... */
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // ...
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (/* ... */) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (/* ... */)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  /// Finalize emission of the shuffles.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &,
                             function_ref<Value *(Value *, Value *,
                                                  ArrayRef<int>)>)>
               Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      // Resize the final value to the requested VF, if needed.
      if (/* ... */) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask,
             [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
               return createShuffle(V1, V2, Mask);
             });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          Type *OrigScalarTy = ScalarTy;
          // ...
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(/* ... */, PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (/* ... */)
            I1 = I2 + CommonMask.size();
        }
        // ...
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    // ...
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ... (body elided)
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = /* ... */; });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    // Drop the subvectors mask if reordering already placed every scalar,
    // i.e. when for each lane:
    //   E->Scalars[I] == GatheredScalars[ReorderMask[I]]
    // ...
    SubVectorsMask.clear();
  }
  // Checks whether a mask slice can reuse an already emitted splat, to avoid
  // a second buildvector.
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (/* ... */ any_of(/* ... */, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    if (!IsNotPoisonous) {
      TreeEntry *UserTE = E->UserTreeIndex.UserTE;
      unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
      if (UserTE->getNumOperands() != 2)
        return false;
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      // ...
      if (!(*It)->ReorderIndices.empty()) {
        // ...
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return /* ... */;
          }))
        return false;
    }
    if ((Mask.size() < InputVF &&
         /* ... */) ||
        (Mask.size() == InputVF &&
         /* ... */)) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /* ... */), 0);
    } else {
      // ...
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /* ... */), /* ... */);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  // ...
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  // ...
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  bool Resized = false;
  // Check for gathered extracts.
  ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    SmallVector<const TreeEntry *> ExtractEntries;
    for (auto [Idx, I] : enumerate(ExtractMask)) {
      // ... (poison lanes skipped)
      if (ArrayRef<TreeEntry *> TEs = /* ... */; !TEs.empty())
        ExtractEntries.append(TEs.begin(), TEs.end());
    }
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
        if (VF == VecBaseTy->getNumElements() &&
            GatheredScalars.size() != VF) {
          Resized = true;
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
        }
    }
  }
  // Check whether full tree-entry matches should be looked up at all.
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
        /* ... */) &&
       any_of(E->Scalars,
              [this](Value *V) {
                return isa<LoadInst>(V) && isVectorized(V);
              })) ||
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      /* ... */ ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
    GatherShuffles =
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
  }
  if (!GatherShuffles.empty()) {
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, Entries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (GatherShuffles.size() == 1 &&
        /* ... */
        Entries.front().front()->isSame(E->Scalars)) {
      // Perfect match in the graph, will reuse the previously vectorized
      // node. Cost is 0.
      LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                        << /* ... */ ".\n");
      // Restore the mask for previous partially matched values.
      Mask.resize(E->Scalars.size());
      const TreeEntry *FrontTE = Entries.front().front();
      if (FrontTE->ReorderIndices.empty() &&
          ((FrontTE->ReuseShuffleIndices.empty() &&
            E->Scalars.size() == FrontTE->Scalars.size()) ||
           (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
        std::iota(Mask.begin(), Mask.end(), 0);
      } else {
        // ... (poison lanes masked out, then:)
        Mask[I] = FrontTE->findLaneForValue(V);
      }
      ShuffleBuilder.resetForSameNode();
      ShuffleBuilder.add(*FrontTE, Mask);
      // Full matched entry found, no need to insert subvectors.
      Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      return Res;
    }
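    // Editorial note (hedged): a "perfect diamond match" is a gather node
    // whose scalars are exactly the scalars of one already-vectorized entry,
    // so no new gather is emitted; only a lane-remap mask built from
    // FrontTE->findLaneForValue plus the common reuse mask is applied. For
    // instance, gathering {%b, %a} from an entry vectorized as <%a, %b>
    // reduces to a single shufflevector with mask <1, 0>.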
    if (GatheredScalars.size() != VF &&
        any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
          return any_of(TEs, [&](const TreeEntry *TE) {
            return TE->getVectorFactor() == VF;
          });
        })) {
      Resized = true;
      GatheredScalars.append(VF - GatheredScalars.size(),
                             PoisonValue::get(OrigScalarTy));
    }
    // Remove shuffled elements from the list of gathers.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] != PoisonMaskElem)
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
  }
  // Gather unique scalars and all constants.
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // ...
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   /* ... */;
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // ...
    int NumNonConsts = 0;
    // ... (per-scalar classification; for each unique scalar:)
    //       Scalars.front() = OrigV;
    //       const auto Res = UniquePositions.try_emplace(OrigV, I);
    //       Scalars[Res.first->second] = OrigV;
    //       ReuseMask[I] = Res.first->second;
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (!UndefPos.empty() && UndefPos.front() == 0)
        /* ... */;
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // Replace undefs by the non-poisoning splat value, if possible.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return /* ... */
               (E->UserTreeIndex &&
                any_of(V->uses(), [E](const Use &U) {
                  return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                         is_contained(E->UserTreeIndex.UserTE->Scalars,
                                      U.getUser());
                }));
      });
      if (It != Scalars.end()) {
        // ...
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // ...
          ReuseMask[I] = Pos;
        }
      }
    } else {
      // ...
      for (int I : UndefPos) {
        // ...
      }
    }
  };
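  // Illustrative sketch (hedged): TryPackScalars deduplicates a gather input
  // into unique scalars plus a reuse mask. For Scalars = {%x, %y, %x, undef}
  // it can produce the packed set {%x, %y, poison, poison} with
  // ReuseMask = {0, 1, 0, <undef policy>}, so a single buildvector plus one
  // shuffle reproduces all four lanes.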
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ... (lanes covered by gather shuffles masked out)
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          // ... (poison lanes skipped)
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = /* ... */;
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        // ...
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = /* ... */;
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        // ...
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? /* ... */
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &= /* ... */;
        }
      }
    }
    // Try to figure out the best way to combine the final shuffle.
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // ...
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return /* ... single-source ... */;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         /* ... */) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return /* ... single-source ... */;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         /* ... */);
    bool EnoughConstsForShuffle =
        /* ... */
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 && /* ... */) ||
         /* ... */);
    // Gather the constants separately from the non-constants.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        /* ... move the constant into the buildvector set ... */;
    }
    // Generate constants vector and build vector.
    if (/* ... */) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle && ((IsIdentityShuffle &&
                                       IsNonPoisoned) ||
                                      IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splat + shuffle versus a plain
              // buildvector of the non-constant values.
              constexpr TTI::TargetCostKind CostKind =
                  TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                            VecTy, NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  /* ... */);
              // Shuffle required?
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = /* ... */;
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              // ...
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // ...
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              // ...
              BV = CreateShuffle(BV, nullptr, SplatMask);
              // ... (for each lane taken from the splat:)
              //       Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
  } else if (/* ... */) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
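// Worked example (hedged, illustrative unit costs): for a non-constant splat
// {%x, %x, %x, %x}, CheckIfSplatIsProfitable compares
//   SplatCost = one insertelement of %x into lane 0 + one two-source shuffle
// against
//   BVCost   = an insertelement per demanded lane (+ a single-source shuffle
//              when more than one lane is populated).
// With a typical cost of 1 per insert and 1 per shuffle, the splat wins as
// soon as three or more lanes would otherwise need their own insert.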
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder,
                                                                *this);
}
// ... (intervening helper elided; only this fragment survives)
//   for (Value *V : VL)
//     ...

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    // ... (use the demoted integer type recorded for this entry)
  }
  // ...
  if (E->VectorizedValue)
    return E->VectorizedValue;

  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    // ...
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(
        OpTE2.isSame(
            ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    // ...
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *V) {
          if (isa<PoisonValue>(V))
            return false;
          return !isKnownNonNegative(V, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    if (/* ... */)
      Op1 = Builder.CreateIntCast(
          Op1, /* ... */,
          GetOperandSignedness(&OpTE1));
    if (/* ... */)
      Op2 = Builder.CreateIntCast(
          Op2, /* ... */,
          GetOperandSignedness(&OpTE2));
    if (E->ReorderIndices.empty()) {
      // Concatenate the two parts with an identity-prefix mask.
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(),
                    E->CombinedEntriesWithIndices.back().second),
          0);
      if (ScalarTyNumElements != 1) {
        // ...
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      Vec = /* insert Op2 at offset */
          E->CombinedEntriesWithIndices.back().second * ScalarTyNumElements;
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (/* ... */) {
      std::iota(Mask.begin(),
                std::next(Mask.begin(), OpTE1.getVectorFactor()), 0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (/* ... */) {
      std::iota(Mask.begin(),
                std::next(Mask.begin(), OpTE2.getVectorFactor()), 0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }
  bool IsReverseOrder =
      /* ... */;
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() ||
         E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                        : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return false;
        return !isKnownNonNegative(V, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    // ...
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    // ...

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
      // ... (previously vectorized operand entries reused:)
      //   LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      if (!VisitedBBs.insert(IBB).second) {
        // ...
        continue;
      }
      // ... (stub value recorded for not-yet-emitted operands:)
      //   TreeEntry *OpTE = getOperandEntry(E, I);
      //   assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
      //   OpTE->VectorizedValue = VecOp;
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || /* ... */
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      // ...
    }

    assert(/* ... */ "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(/* ... */);
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    // ... (metadata propagated into NewV)
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    Value *V = vectorizeOperand(E, 1);
    // ...
    Type *ScalarTy = Op.front()->getType();
    if (/* ... */) {
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V, /* ... */, Res.second);
    }

    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    // ...
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // Check whether the insert indices form an identity mask.
    bool IsIdentity = true;
    SmallVector<int> PrevMask(/* ... */);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      IsIdentity &= InsertIdx - Offset == I;
      // ...
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          /* ... */;
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        // ... (per insert in the chain:)
        //   InsertMask[*InsertIdx] = *InsertIdx;
        //   if (!Ins->hasOneUse())
        //     break;
        //   Ins = dyn_cast_or_null<InsertElementInst>(
        //       Ins->getUniqueUndroppableUser());
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            /* ... */;
        SmallBitVector IsFirstUndef =
            /* ... */;
        if (!IsFirstPoison.all()) {
          // ...
          for (unsigned I = 0; I < NumElts; I++) {
            if (/* ... */ !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              // ...
            }
          }
        }
        // ...
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // ...
      for (unsigned I = 0; I < NumElts; I++) {
        // ... (rebuild InsertMask against the first insert's source)
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          /* ... */;
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          // ...
          SmallBitVector IsFirstPoison =
              /* ... */;
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (/* ... */)
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              /* ... */);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        } else {
          SmallBitVector IsFirstPoison =
              /* ... */;
          for (unsigned I = 0; I < NumElts; I++) {
            if (/* ... */)
              InsertMask[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
              /* ... */);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }
    }
    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);
    // ...
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (/* ... */
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         /* ... */)) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(
                         static_cast<Instruction::CastOps>(VecOpcode), InVec,
                         VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
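  // Worked example (hedged): with MinBWs demoting this node to i16 while the
  // operand node stays at i8, BWSz = 16 > SrcBWSz = 8 selects SExt or ZExt
  // based on the recorded signedness; equal widths degrade the cast to a
  // BitCast (often folded away entirely), and a narrower destination becomes
  // a Trunc.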
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((/* ... */
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    // ...
    Value *V = Builder.CreateCmp(P0, L, R);
    // ...
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp /* ... */)
      ICmp->setSameSign(/*B=*/false);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || /* ... */
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    // ...
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(/* ... */
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is i1 but the source is a fixed vector type, we
      // need to duplicate the condition value.
      Cond = Builder.CreateShuffleVector(
          Cond, /* ... replicated mask ... */);
    }
    assert(/* ... */
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || /* ... */
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    // ...
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      // A demoted "and" with a constant mask that keeps all demoted bits can
      // be replaced by the other operand, e.g. when for every lane:
      //   CI && CI->getValue().countr_one() >= It->second.first
      // ... (if so:)
      //   V = FinalShuffle(I == 0 ? RHS : LHS, E);
      //   E->VectorizedValue = V;
      //   ++NumVectorInstructions;
      //   return V;
    }
    if (/* ... */) {
      assert((/* ... */
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (/* ... */)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (/* ... */)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    // ...
    // Drop nuw flags in case of simplified vectorized Sub instructions with
    // reordered (commutative) operands.
    if (auto *I = dyn_cast<Instruction>(V);
        I && !MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
        any_of(E->Scalars, [](Value *V) {
          return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
        }))
      I->setHasNoUnsignedWrap(/*b=*/false);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
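  // Editorial note (hedged): the nuw drop above is conservative. If any
  // scalar lane of a Sub bundle was rewritten through commutative operand
  // reordering, a per-lane `sub nuw` guarantee may no longer hold for the
  // combined vector op, so the flag is cleared on the whole vector
  // instruction rather than proven per lane.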
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    // ...
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      if (IsMasked) {
        // Build the boolean mask selecting the lanes actually loaded.
        // ...
        for (int I : CompressMask)
          /* ... MaskValues[I] = true ... */;
        // ...
        MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        // ...
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         /* ... */);
      } else {
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      }
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      // ...
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      // ...
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
      // ...
      Value *Stride = SPtrInfo.StrideVal;
      if (!Stride) {
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      }
      Value *NewStride =
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride,
          ConstantInt::get(StrideTy,
                           (IsReverseOrder ? -1 : 1) *
                               static_cast<int>(
                                   DL->getTypeAllocSize(ScalarTy))));
      // ...
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          {PO, StrideVal, /* ... all-ones mask ... */,
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
          /* ... alignment attribute on the pointer argument ... */);
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // ...
      if (/* ... revectorized vector-of-vectors pointer ... */) {
        unsigned ScalarTyNumElements =
            /* ... */;
        unsigned VecTyNumElements =
            /* ... */;
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        // ... (per-lane offsets, e.g.:)
        //       return Builder.getInt64(I % ScalarTyNumElements);
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(/* ... */),
            /* ... */);
      }
      // ...
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    // ... (metadata propagation elided)
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? /* ... compress shuffle of NewLI ... */
                   : /* ... */ NewLI;
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
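  // Illustrative IR (hedged): for 4 i32 loads with a constant -8 byte stride
  // (reverse order, element gap of one i32), the StridedVectorize path above
  // emits roughly
  //
  //   %l = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
  //            ptr align 4 %base, i64 -8,
  //            <4 x i1> splat (i1 true), i32 4)
  //
  // where the stride is in bytes (alloc size times the reorder sign) and the
  // all-ones mask plus EVL = 4 make it an unpredicated full-width load.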
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        // ...
        Ptr = SI->getPointerOperand();
      }
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /* ... alignment attribute on the pointer argument ... */);
      ST = Inst;
    }
    // ... (metadata propagation elided)
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);
    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (auto *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Call: {
    auto *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    // ...
    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    // ...
    bool UseIntrinsic = /* ... */
        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // Some intrinsics have scalar arguments. This argument should not be
      // vectorized.
      if (/* ... */) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of an abs intrinsic, its
        // second argument must be set to false (do not return poison, if the
        // value is the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        // ...
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (/* ... */
          It == MinBWs.end()) {
        // ...
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      // ...
    }

    Function *CF;
    if (!UseIntrinsic) {
      // ...
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      // ...
    }

    // ...
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (/* ... plain revectorized shuffle ... */) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        // Fold the shuffle mask through the source shuffle.
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      // ...
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             /* ... */
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (/* binary op or cmp */) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (/* ... operand types need adjustment ... */) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (/* ... cmp with mismatched operand widths: pick the wider */
            cast<VectorType>(RHS->getType())
                ->getElementType()
                ->getIntegerBitWidth())
          CastTy = /* ... */;
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (/* binary op */) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        // ...
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (/* ... demoted integer cast pair ... */) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(/* ... */ "Expected same type as operand.");
            // ...
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle to take the alternate operations from the vector.
      // Also gather up main and alt scalar ops to propagate IR flags to each
      // vector operation.
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return /* ... is alternate lane ... */;
          },
          Mask, &OpScalars, &AltScalars);

      // ...
      // Drop nuw flags in case of simplified vectorized Sub instructions
      // with reordered operands.
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());
      // ...
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
}
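// Illustrative sketch (hedged): an alternate-opcode node such as
// {add, sub, add, sub} is emitted as two full-width vector ops plus a blend:
//
//   %v0 = add <4 x i32> %a, %b
//   %v1 = sub <4 x i32> %a, %b
//   %v  = shufflevector <4 x i32> %v0, <4 x i32> %v1,
//                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>
//
// buildAltOpShuffleMask() produces that <0, 5, 2, 7> mask from the per-lane
// main/alternate opcode assignment.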
Value *BoUpSLP::vectorizeTree(
    /* ... */
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // ...
  EntryToLastInstruction.clear();
  // Schedule the instructions in all basic blocks.
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // Cache last instructions for the nodes to avoid side effects, which may
  // appear during vectorization, like extra uses, etc.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           /* ... */);
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Cache the last instructions of user nodes whose scalars are all used
  // outside their block, before emitting the gathers.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      // ...
      (void)getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
    }
  }
  // ...
  for (auto &Entry : GatherEntries) {
    // ...
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
    // ...
  }
  // Vectorize gathered-loads nodes, if any.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      // ...
    }
  }
  // Emit postponed gather nodes.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    // ... (drop the stub value and re-emit)
    TE->VectorizedValue = nullptr;
    // ...
    if (UI->comesBefore(InsertPt))
      /* ... */;
    Builder.SetInsertPoint(InsertPt);
    // ... (or, if the stub dominates:)
    //   Builder.SetInsertPoint(PrevVec);
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    // ...
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        // Look for recorded signedness in any tree entry containing V.
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // Also check the gather nodes built from V.
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // ...
        IsSigned = IsSigned.value_or(false) ||
                   /* ... */;
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    // ... (users already RAUWed are skipped)
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skipped.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        /* ... */)
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        // ...
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts; just move the only one within the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() !=
                    Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              // ...
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          if (ReplaceInst) {
            // Leave the extractelement as is, or clone the instruction.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
            // "Reuse" the existing extract to improve final codegen.
            Value *V = ES->getVectorOperand();
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            unsigned VecTyNumElements = VecTy->getNumElements();
            // Extract a whole subvector for revectorized scalars.
            Ex = /* ... subvector extract at offset */
                ExternalUse.Lane * VecTyNumElements;
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend the extracted value to
          // the original scalar type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(), /* ... */);
          ScalarToEEs[Scalar].try_emplace(
              /* ... block key, else: */ &F->getEntryBlock(),
              std::make_pair(Ex, ExV));
        }
        // Register the extract for the later gather-sequence CSE.
        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(/* ... */
             "In-tree scalar of vector type is not insertelement?");
      // ...
      return Vec;
    };
    // If User == nullptr, the Scalar remains as a scalar in vectorized
    // instructions or is used as an extra argument. Generate an
    // extractelement and update the record in ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesWithNonUsers.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(
               Scalar->users(),
               [&](llvm::User *U) {
                 if (ExternalUsesAsOriginalScalar.contains(U))
                   return true;
                 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                 return !UseEntries.empty() &&
                        (E->State == TreeEntry::Vectorize ||
                         E->State == TreeEntry::StridedVectorize ||
                         E->State == TreeEntry::CompressVectorize) &&
                        any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                          return (UseEntry->State == TreeEntry::Vectorize ||
                                  UseEntry->State ==
                                      TreeEntry::StridedVectorize ||
                                  UseEntry->State ==
                                      TreeEntry::CompressVectorize) &&
                                 doesInTreeUserNeedToExtract(
                                     Scalar,
                                     getRootEntryInstruction(*UseEntry),
                                     /* ... */);
                        });
               })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // ...
      continue;
    }
    // ... (in-place replacement of the original scalar, when requested)
    Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    if (Scalar != NewInst) {
      assert(/* ... */
             "Extractelements should not be replaced.");
      Scalar->replaceAllUsesWith(NewInst);
    }
    // Fix up insertelement users of vectorized values that need a cast.
    if (auto *VU = dyn_cast<InsertElementInst>(User)) {
      if (!UsedInserts.insert(VU).second)
        continue;
      // Need to use the original vector, if the root is truncated.
      auto BWIt = MinBWs.find(E);
      if (BWIt != MinBWs.end() /* ... */) {
        auto *ScalarTy = FTy->getElementType();
        auto Key = std::make_pair(Vec, ScalarTy);
        auto VecIt = VectorCasts.find(Key);
        if (VecIt == VectorCasts.end()) {
          // ...
          if (auto *IVec = dyn_cast<PHINode>(Vec)) {
            if (IVec->getParent()->isLandingPad())
              Builder.SetInsertPoint(IVec->getParent(),
                                     std::next(IVec->getParent()
                                                   ->getLandingPadInst()
                                                   ->getIterator()));
            else
              Builder.SetInsertPoint(
                  IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
          } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
            Builder.SetInsertPoint(IVec->getNextNode());
          }
          Vec = Builder.CreateIntCast(
              Vec,
              /* ... widened cast type ... */,
              BWIt->second.second);
          // ...
        } else {
          Vec = VecIt->second;
        }
      }

      // Record the insert into the shuffled-inserts data.
      if (/* ... */) {
        auto *It = find_if(
            ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
              // Checks if 2 insertelements are from the same buildvector.
              return /* ... */;
            });
        unsigned Idx = *InsertIdx;
        if (It == ShuffledInserts.end()) {
          // ...
          It = std::next(ShuffledInserts.begin(),
                         ShuffledInserts.size() - 1);
          // ...
        }
        // ...
        Mask[Idx] = ExternalUse.Lane;
        continue;
      }
    }

    // Generate extracts for out-of-tree users and find the insertion point
    // for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (auto *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (/* ... */) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(
                  PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        // ...
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        // ...
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // ...
    }
  }

  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        /* ... */);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    // ...
    if (any_of(Mask,
               [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    if (!ForSingleMask) {
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      for (unsigned I = 0; I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem)
          ResizeMask[Mask[I]] = Mask[I];
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    InsertElementInst *FirstInsert =
        ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    //   return cast<VectorType>(Vec->getType())
    //       ->getElementCount()
    //       .getKnownMinValue();
    // ... (combine all value/mask pairs into one vector, driven by:)
    Value *NewInst = /* ... */ (
        /* ... */,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity,
            // non-resizing mask.
            if (Mask.size() !=
                    cast<FixedVectorType>(Vals.front()->getType())
                        ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    // Rebuild the chain of insertelements on top of the combined vector.
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the original chain, if no deletions occurred.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    // ...
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      // ...
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      // ...
    }
    // Detach the rest of the insertelement chain.
    for (InsertElementInst *IE :
         reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            /* ... poison ... */);
      IE->replaceUsesOfWith(IE->getOperand(1),
                            /* ... poison ... */);
      // ...
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  SmallVector<SelectInst *> LogicalOpSelects;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      // ...
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          /* ... */)
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      // ...
#ifndef NDEBUG
      for (User *U : Scalar->users()) {
        // ...
        assert((isVectorized(U) ||
                (UserIgnoreList && UserIgnoreList->contains(U)) ||
                /* ... */) &&
               "Deleting out-of-tree value");
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      // ... (scalar recorded in RemovedInsts)
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(/* ... root vector value ... */))
    V->mergeDIAssignID(RemovedInsts);

  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            /* ... */) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            /* ... */) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      I->replaceUsesWithIf(/* ... poison ... */, [&](Use &U) {
        // Do not replace the condition of a poisoning logical op.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
    }
    // Replace the conditions of the poisoning logical ops with a non-poison
    // constant value.
    for (SelectInst *SI : LogicalOpSelects)
      SI->setCondition(/* ... */);
  }
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE);
      ReductionBitWidth != 0 && It != MinBWs.end() &&
      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        /* ... vector type with ReductionBitWidth elements ... */,
        It->second.second);
  }
  return Vec;
}
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM the gather sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If any operand is defined inside this loop, we can't hoist the
    // instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the second one or poison.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the trailing undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(getWidenedType(
               SI1->getType()->getElementType(), SM1.size())) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions (or the visited one with this one, if it is less
      // defined).
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
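// Builds a ScheduleBundle that ties together the ScheduleData (or, for
// copyable elements, ScheduleCopyableData) nodes of all scalars in one
// vectorizable bundle, so the scheduler can treat them as a single unit.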
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  auto &BundlePtr = ScheduledBundlesList.back();
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // Copyable elements are modeled by a dedicated node instead of the
      // original instruction's ScheduleData.
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions, or an all-non-schedulable bundle.
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if ((!HasCopyables && doesNotNeedToSchedule(VL)) ||
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) {
    // Even though nothing is scheduled here, operands whose uses are now all
    // modeled by copyable data must get their direct dependencies recomputed.
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
        unsigned &NumOps =
            UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
                .first->getSecond();
        ++NumOps;
        if (auto *Op = dyn_cast<Instruction>(U.get());
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
              ControlDependentMembers.push_back(OpSD);
          }
        }
      }
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
                            ControlDependentMembers);
    }
    return nullptr;
  }

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
          // A copyable element shadows the original instruction: invalidate
          // the deps of the original ScheduleData, if any.
          if (ScheduleData *BundleMember = getScheduleData(CD->getInst());
              BundleMember && BundleMember->hasValidDependencies()) {
            BundleMember->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(
                    BundleMember->getInst()))
              ControlDependentMembers.push_back(BundleMember);
          }
          continue;
        }
        auto *SD = cast<ScheduleData>(SE);
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          ++NumOps;
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      // The region has grown: all previously computed dependencies are stale
      // and must be dropped.
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      ReSchedule = true;
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                        << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }
    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it is
    // important that we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region cannot be extended, still recalculate dependencies, as
      // the region may have grown for earlier bundle members; otherwise the
      // compiler may emit instructions in the wrong order at the actual
      // scheduling.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }
  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    SmallVector<ScheduleCopyableData *> CopyableData =
        getScheduleCopyableData(V);
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (!BundleMember)
      continue;

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    }

    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    // The bundle cannot be scheduled: undo everything and put the still-ready
    // members back into the ready list so they can be scheduled as single
    // instructions.
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      if (BD->isReady()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    SmallPtrSet<Instruction *, 4> Visited;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Remove the ScheduleCopyableData created for this failed bundle and
        // restore the links to any user copyable data.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        if (EI.UserTE) {
          ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
          do {
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
            if (!EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
            if (!Visited.insert(In).second) {
              It = std::find(std::next(It), Op.end(), I);
              continue;
            }
            ScheduleCopyableDataMapByInstUser
                [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                    .remove(SD);
            It = std::find(std::next(It), Op.end(), I);
          } while (It != Op.end());
        }
        const EdgeInfo &UserEI = EI.UserTE->UserTreeIndex;
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // The original instruction may need its dependencies recomputed.
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
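// ScheduleData nodes are bump-allocated in fixed-size chunks so that pointers
// to them remain stable while new nodes are created.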
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
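// Grows the scheduling region [ScheduleStart, ScheduleEnd) until it contains
// V, scanning upwards and downwards simultaneously so the closer end is found
// first. Assume-like intrinsics are skipped and do not count towards
// ScheduleRegionSizeLimit.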
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not
  // counted for the scheduling region size limit.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore)
        CurrentLoadStore->setNextLoadStore(SD);
      else
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
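// Computes the dependency graph for a bundle: def-use edges to in-region
// users, control dependencies across non-speculatable instructions and
// stacksave/stackrestore, and memory dependencies along the load/store chain.
// The alias scan is bounded by AliasedCheckLimit and MaxMemDepDistance to
// keep compile time in check.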
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDependentMembers) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      // A copyable data node depends on the user lanes of its edge: find the
      // in-tree users for this instruction and add def-use dependencies.
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (!EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(In)) {
            // The user lane is itself copyable: depend on its copyable data,
            // not on the original instruction.
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          }
          It = std::find(std::next(It), Op.end(), CD->getInst());
        } while (It != Op.end());
      }
      // Pessimistically add an extra dependency if the user's main
      // instruction lives in the same block or has in-block users, so the
      // copyable element cannot be scheduled past it.
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() != CD->getInst()->getParent() ||
            EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
            any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
              auto *IU = dyn_cast<Instruction>(U);
              if (!IU)
                return true;
              return IU->getParent() == EI.UserTE->getMainOp()->getParent();
            })))) {
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (ScheduleData *UseSD = getScheduleData(U)) {
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        ++NumOps;
        // Skip users whose uses of this value are fully modeled by copyable
        // data nodes.
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate is control dependent on
    // any early exit or non-willreturn call which precedes it.
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        MakeControlDependent(I);
        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloca alloca instruction, it needs to be scheduled
      // after any preceding stacksave/stackrestore.
      if (match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode())
          MakeControlDependent(I);
      }
      // Also make sure memory accesses aren't moved across
      // stacksave/stackrestore boundaries.
      if (BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            continue;
          MakeControlDependent(I);
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // Bail out for very large blocks to save compile time.
      if (DistToSrc >= MaxMemDepDistance)
        break;
      if (IsNonSimpleSrc ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        ++NumAliased;
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      ++DistToSrc;
    }
  };

  assert((!Bundle || !Bundle.getBundle().empty() ||
          !ControlDependentMembers.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Bundle.getBundle().front());
  WorkList.append(ControlDependentMembers.begin(),
                  ControlDependentMembers.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    SmallVector<ScheduleBundle *, 1> CopyableBundle;
    ArrayRef<ScheduleBundle *> Bundles;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(SD->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle << "\n");
      }
    }
  }
}
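// Clears the scheduled flag and the unscheduled-dependency counters of every
// entity in the region, so a later scheduling attempt starts from a clean
// state.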
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
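// Final scheduling of a block: every bundle/instruction gets a priority equal
// to its original position, then the highest-priority ready entity is
// repeatedly popped and its instructions moved into place. This keeps the
// final order as close as possible to the original program order.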
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              doesNotNeedToBeScheduled(I)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // The instruction is used by copyable elements, so its dependencies
        // must be computed even though it is not bundled itself.
        ScheduleBundle Bundle;
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    } else {
      Instruction *PickedInst = cast<ScheduleData>(Picked)->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
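// Returns the element width (in bits) to assume when computing the register
// usage of a value: the width of the stored type for stores, otherwise the
// widest load/extract found by walking the expression tree, since memory
// access width is the most reliable basis for the vector element size.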
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // Otherwise traverse the expression tree bottom-up looking for loads.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();
    // Only scalar instructions are interesting here.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    for (Use &U : I->operands()) {
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (Visited.insert(J).second &&
            (isa<PHINode>(I) || J->getParent() == Parent))
          Worklist.emplace_back(J, J->getParent());
      if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
        FirstNonBool = U.get();
    }
  }
  // If we didn't encounter a memory access in the expression tree, just
  // return the width of V itself.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }
  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;
  return Width;
}
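// Decides per tree entry whether the computation can be performed in a
// narrower integer type. For instance, if only the low 8 bits of a chain of
// i32 adds are demanded, the chain can be demoted to i8 and the root widened
// back with a single zext/sext.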
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must be kept in the original
  // bit width.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    if (getTreeEntries(V).size() > 1)
      return false;
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedNode)
      ++BitWidth1;
    APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
    unsigned BitWidth2 =
        std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
    while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        break;
      BitWidth2 *= 2;
    }
    BitWidth1 = std::min(BitWidth1, BitWidth2);
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot)) {
            ToDemote.push_back(E.Idx);
            return true;
          }
      }
      // Check possible extractelement instruction bases and the final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndex && any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate to a smaller lshr iff we know the bits we would
    // otherwise shift in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate to a smaller ashr iff we know all the bits we would
    // otherwise shift in are sign bits.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(BitWidth);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
      auto VecCallCosts = getVectorCallCosts(
          IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
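// Drives the demotion analysis from the tree roots: computes the maximal bit
// width actually required (from sign bits and DemandedBits), rounds it up to
// a power of two of at least 8 bits (e.g. a 5-bit requirement becomes i8),
// and records the result in MinBWs for use during vector codegen.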
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // it is the final deemed node for the demotion.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep the trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // Determine if the sign bit of all the roots is known to be zero. If not,
    // IsKnownPositive is set to false.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      auto *I = dyn_cast<Instruction>(Root);
      if (!I) {
        MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
        continue;
      }
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the reduced type does not improve the register usage - ignore it.
    unsigned NumParts = ::getNumberOfParts(
        *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
    if (NumParts > 1 &&
        NumParts ==
            ::getNumberOfParts(
                *TTI, getWidenedType(IntegerType::get(F->getContext(),
                                                      bit_ceil(MaxBitWidth)),
                                     VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        match(V, m_Add(m_Value(), m_Value()));
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (auto *I = dyn_cast<Instruction>(V)) {
          APInt Mask = DB->getDemandedBits(I);
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList && any_of(*UserIgnoreList, [](Value *V) {
        auto *IC = dyn_cast<ICmpInst>(V);
        return IC && IC->isSigned();
      }))
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      NodesToKeepBWs.insert_range(ToDemote);
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  bool Changed = false;

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // Start a new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
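// Attempts to vectorize a single chain of consecutive stores. Returns true on
// success, false if not profitable, and std::nullopt when the chain could not
// even be scheduled; Size reports the canonical graph size so the caller can
// judge which other vectorization factors are still worth trying.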
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(ValOps.getArrayRef(),
                                                        R);
  bool IsAllowedSize =
      hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                               ValOps.size()) ||
      (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
        any_of(ValOps.getArrayRef(), [&](Value *V) {
          return !isa<ExtractElementInst>(V) &&
                 (V->getNumUses() > Chain.size() ||
                  any_of(V->users(),
                         [&](User *U) { return !Stores.contains(U); }));
        }))) ||
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
    return false;
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}

/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
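// Groups stores by the constant distance of their pointer to a base store.
// For example, stores to p+0, p+1 and p+3 (with p+0 as base) are kept as
// {0 -> idx0, 1 -> idx1, 3 -> idx2}; the run of consecutive distances (0, 1)
// then forms a candidate chain for vectorizeStoreChain.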
namespace {
/// A group of stores that we'll try to bundle together using vector ops.
/// They are ordered using the signed distance of their address operand to the
/// address of this group's BaseInstr.
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that distance.
  /// \returns The previously associated instruction index, or std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the group's BaseInstr.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try to vectorize them
    // again. Their distance will be "rebased" to use NewBaseInstIdx as
    // reference.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all the stores that have been vectorized from this group.
  void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and
    // erase all stores before it so we don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the Base instruction, i.e. the one with a 0 pointer
  /// distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
} // end anonymous namespace
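// Scans each group of related stores, carving maximal runs of consecutive
// pointer distances into candidate chains and trying a ladder of
// vectorization factors from MaxVF down to MinVF (plus one non-power-of-2
// candidate when the target supports it), repeating with doubled VFs up to
// MaxAttempts rounds.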
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store
  // twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
        Operands.clear();
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      Type *StoreScalarTy = StoreTy->getScalarType();
      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          ValueTy->getScalarType()));
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }
      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           VF = divideCeil(VF, 2))
        CandidateVFs.push_back(VF);

      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        unsigned End = Operands.size();
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore =
              std::distance(RangeSizes.begin(),
                            find_if(RangeSizes, std::bind(IsNotVectorized,
                                                          VF >= MaxRegVF, _1)));
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(SliceStartIdx, VF);
              assert(cast<StoreInst>(Slice.front())
                             ->getValueOperand()
                             ->getType() ==
                         cast<StoreInst>(Slice.back())
                             ->getValueOperand()
                             ->getType() &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // A prefix of this slice is known to be unschedulable: skip
                  // past it.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
              if (!Res) {
                // Remember the slice as unschedulable.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert_range(Slice);
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or no need for the last attempts.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize the max number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };
  // Stores pair (first: index of the store into Stores array ref, address of
  // which taken as base, second: sorted set of pairs {index, dist}, which are
  // indices of stores in the set and their store location distances).
  SmallVector<RelatedStoreInsts> SortedStores;
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }

    // If there is already a store with the same distance, try to vectorize
    // the existing instructions before adding the current store.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
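// Seed collection: a single pass over the block records simple stores
// (bucketed by the underlying object of their pointer operand) and
// single-index, non-constant-index, non-vector GEPs, which later seed
// store-chain and GEP-index vectorization respectively.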
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Visit the store and getelementptr instructions in BB and organize them
  // according to the underlying object of their pointer operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
      continue;
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a vector result type.
    if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type; we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  Type *ScalarTy = getValueType(VL[0]);
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VF - 1)) {
    // No actual vectorization should happen, if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is used
    // for the vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough candidates - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");
      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop();
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
23445 using ReductionOpsType = SmallVector<Value *, 16>;
23446 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23447 ReductionOpsListType ReductionOps;
23451 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23452 WeakTrackingVH ReductionRoot;
23457 bool IsSupportedHorRdxIdentityOp =
false;
23464 static bool isCmpSelMinMax(Instruction *
I) {
23472 static bool isBoolLogicOp(Instruction *
I) {
23478 static bool isVectorizable(
RecurKind Kind, Instruction *
I,
23479 bool TwoElementReduction =
false) {
23480 if (Kind == RecurKind::None)
23489 if (TwoElementReduction)
23492 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23496 return I->getFastMathFlags().noNaNs();
23499 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23502 return I->isAssociative();
23505 static Value *getRdxOperand(Instruction *
I,
unsigned Index) {
23511 return I->getOperand(2);
23512 return I->getOperand(Index);
23517 Value *
RHS,
const Twine &Name,
bool UseSelect) {
23521 case RecurKind::Or: {
23530 case RecurKind::And: {
23539 case RecurKind::Add:
23540 case RecurKind::Mul:
23541 case RecurKind::Xor:
23542 case RecurKind::FAdd:
23543 case RecurKind::FMul: {
23548 case RecurKind::SMax:
23549 case RecurKind::SMin:
23550 case RecurKind::UMax:
23551 case RecurKind::UMin:
23558 case RecurKind::FMax:
23559 case RecurKind::FMin:
23560 case RecurKind::FMaximum:
23561 case RecurKind::FMinimum:
23562 case RecurKind::FMaximumNum:
23563 case RecurKind::FMinimumNum: {
23576 const ReductionOpsListType &ReductionOps) {
23577 bool UseSelect = ReductionOps.size() == 2 ||
23579 (ReductionOps.size() == 1 &&
23581 assert((!UseSelect || ReductionOps.size() != 2 ||
23583 "Expected cmp + select pairs for reduction");
23584 Value *
Op = createOp(Builder, RdxKind,
LHS,
RHS, Name, UseSelect);
23602 return RecurKind::None;
23604 return RecurKind::Add;
23606 return RecurKind::Mul;
23609 return RecurKind::And;
23612 return RecurKind::Or;
23614 return RecurKind::Xor;
23616 return RecurKind::FAdd;
23618 return RecurKind::FMul;
23621 return RecurKind::FMax;
23623 return RecurKind::FMin;
23626 return RecurKind::FMaximum;
23628 return RecurKind::FMinimum;
23634 return RecurKind::SMax;
23636 return RecurKind::SMin;
23638 return RecurKind::UMax;
23640 return RecurKind::UMin;
23666 return RecurKind::None;
23670 return RecurKind::None;
23673 return RecurKind::None;
23677 return RecurKind::None;
23682 return RecurKind::None;
23685 return RecurKind::SMax;
23688 return RecurKind::SMin;
23691 return RecurKind::UMax;
23694 return RecurKind::UMin;
23697 return RecurKind::None;
23701 static unsigned getFirstOperandIndex(Instruction *
I) {
23702 return isCmpSelMinMax(
I) ? 1 : 0;
23707 static unsigned getNumberOfOperands(Instruction *
I) {
23708 return isCmpSelMinMax(
I) ? 3 : 2;
23713 static bool hasSameParent(Instruction *
I, BasicBlock *BB) {
23714 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
23717 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
23719 return I->getParent() == BB;
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }
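  // The use counts mirror the shape of a scalar min/max reduction chain: in
  //   %m1 = select (icmp %m0, %x), %m0, %x
  // the previous select %m0 is used exactly twice, once by the compare and
  // once by the next select, while in a plain arithmetic chain each interior
  // op feeds only the next op. An instruction with extra uses cannot be
  // folded away, so it stops the reduction matching.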
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      // ... (the compare goes into the first list)
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }
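  // For min/max reductions two parallel op lists are kept: ReductionOps[0]
  // holds the compares and ReductionOps[1] the selects, which is what the
  // "Expected cmp + select pairs" assertion in createOp() relies on; plain
  // arithmetic reductions use a single list.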
  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    // ...
  }

public:
  HorizontalReduction() = default;
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    // ...
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }
  bool matchReductionForOperands() const {
    assert(ReductionRoot && "Reduction root is not set!");
    // ...
    return Ops.size() == 2;
  }
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ...
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;
    ReductionRoot = Root;
    // ...
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        // ... (treat the operand as a reduced value if it cannot continue the
        // reduction chain)
        if (/* ... */
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) && /* ... */)) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    // ...
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      // ... (group loads by their pointer base so consecutive loads end up in
      // the same bucket)
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        if (LIt->second.size() > 2) {
          // ...
          hash_value(LIt->second.back()->getPointerOperand());
          // ...
        }
      }
      // ...
          .first->second.push_back(LI);
      // ...
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      for (Value *V : PossibleRedVals) {
        // ... (compute the key/subkey for V)
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // ...
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      // ...
      for (auto &Slice : PossibleRedVals) {
        auto RedValsVect = Slice.second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      // ...
      } else if (!isGoodForReduction(Data)) {
        // ...
        if (!LI || !LastLI || /* ... */)
          // ...
      }
      // ...
      ReducedVals.back().append(Data.rbegin(), Data.rend());
    }
    // Sort the reduced values by group size so the largest groups are tried
    // first.
    stable_sort(ReducedVals, [](const auto &P1, const auto &P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
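  // The matcher above performs a worklist walk from the reduction root: every
  // operand that is itself a compatible reduction op is pushed onto the
  // worklist, everything else is recorded as a (possibly) reduced value.
  // E.g. for
  //   %s0 = fadd fast float %a, %b
  //   %s1 = fadd fast float %c, %d
  //   %r  = fadd fast float %s0, %s1
  // the walk collects {%a, %b, %c, %d} as reduced values and the three fadds
  // as reduction ops. Reduced values are then bucketed by hash key/subkey
  // (loads additionally grouped by pointer base) and the buckets sorted by
  // size, so the largest, most vectorizable groups are attempted first.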
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
                     DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // ...
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit && /* ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    // ...
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // ...
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
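    // Reassociating i1 logical and/or reductions changes which operands are
    // short-circuited: a value that could never propagate poison in the
    // original select-based chain may do so after vectorization. The helpers
    // below therefore freeze an operand unless it can be shown to be the
    // non-poison-propagating side of one of the original bool-logic ops.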
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // ...
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
              // ...
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               })) ||
              // ...
              (It1 != ReducedValsToOps.end() &&
               any_of(It1->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
               }))) {
            // ... (no freeze required)
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }
        // ...
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // ...
      return Res;
    };
    // ...
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        // ...
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    // ...
    for (Value *V : Candidates)
      TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // ...
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // The main loop: try to vectorize each slice of the reduced values.
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // ... (skip values whose instruction no longer matches the slice's
        // state; condition partially elided)
        if ((/* ... */ (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
            /* ... */)
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements: merge with the next slice if
      // together they form a single shuffle.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // ...
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (/* ... */) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (/* ... */) {
        Value *Res = Candidates.front();
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // ...
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // ...
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      // ...
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        // ...
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          // ...
        }
        // ...
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      // ...
      ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);
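      // ReduxWidth starts at the full number of candidate values, is clamped
      // to MaxElts (derived from the target's vector register size and the
      // element size), and is then shrunk by GetVectorFactor until the
      // resulting vector fits the target's register budget; bit_floor keeps
      // the width a power of two while shrinking.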
      unsigned Start = 0;
      unsigned Pos = Start;
      // ...
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // ...
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        // ... (advance the position first; only then shrink the width)
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        // ...
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            /* ... */) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        // ...
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (/* ... */
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              return RedValI && V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // ...
        V.reorderBottomToTop(/* ... */ VL.front()->getType()->isIntOrIntVectorTy() ||
                             ReductionLimit > 2);
        // ...
        // Mark the reduction root and the values reduced in other slices as
        // externally used.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            // ...
            LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // ...
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        // ...
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost Cost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        // ...
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of the analyzed values.
              for (unsigned VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (/* ... */
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                // ... (mark each sub-slice as analyzed; loop bounds elided)
                IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
        // ...
        // If the root is a select (min/max idiom), the new instructions are
        // inserted before its compare condition.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // Update TrackedToOrig mapping, since the tracked values might be
        // updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }
        // ... (freeze the root if a bool-logic reduction could propagate
        // poison after reassociation)
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Type *ScalarTy = VL.front()->getType();
        // ... (record the vectorized root with its scale factor and
        // signedness; conditions partially elided)
        VectorValuesAndScales.emplace_back(
            VectorizedRoot,
            OptReusedScalars && SameScaleFactor
                ? SameValuesCounter.front().second
                : 1,
            V.isSignedMinBitwidthRootNode());
        // Count vectorized reduced values to exclude them from final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        // ...
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    if (!VectorizedTree) {
      if (!CheckForReusedReductionOps) {
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      }
      return nullptr;
    }
    // Reorder operands of bool logical op in the natural order to avoid
    // possible problem with poison propagation. If not possible to reorder
    // (both operands are originally RHS), emit an extra freeze.
    auto FixBoolLogicalOps = [&](Value *&LHS, Value *&RHS, Instruction *RedOp1,
                                 Instruction *RedOp2, bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    /* ... */))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    /* ... */)) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: combine the remaining scalar values pairwise.
    auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                               Sz % 2);
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        // ...
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // ...
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          /* ... */ InitStep);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2 == 1)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
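    // FinalGen performs one pairwise combining step over the leftover scalar
    // values, e.g. [(I0,v0), (I1,v1), (I2,v2)] becomes
    // [(I0, v0 op v1), (I2, v2)]; an odd trailing element is carried over
    // unchanged. The loop below applies it repeatedly until a single value
    // remains, so the leftovers are folded in roughly log2(N) rounds instead
    // of one long sequential chain.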
    // Collect all not-vectorized reduction values and the partially reduced
    // trees into ExtraReductions.
    SmallPtrSet<Value *, 8> Visited;
    for (ArrayRef<Value *> Candidates : ReducedVals) {
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             /* ... */)
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
    }
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;

    ReductionRoot->replaceAllUsesWith(VectorizedTree);

    // The original scalar reduction ops are dead by now; verify (in asserts
    // builds) that their remaining uses are all inside the reduction itself,
    // then drop them.
    SmallPtrSet<Value *, 4> IgnoreSet;
    for (ArrayRef<Value *> RdxOps : ReductionOps) {
      for (Value *Ignore : RdxOps) {
        // ...
        for (auto *U : Ignore->users()) {
          assert(IgnoreSet.count(U) &&
                 "All users must be either in the reduction ops list.");
        }
        if (!Ignore->use_empty()) {
          // ...
          Ignore->replaceAllUsesWith(P);
        }
      }
      V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    }
    return VectorizedTree;
  }
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    // ... (for revectorized values, reduce each lane group separately)
          Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
    // ...
    Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    // ...
    if (Rdx->getType() != DestTy)
      // ...
    // ...
    Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    // ...
  }
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    // ...
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        // ...
        Cost += GenCostFn();
        // ...
        for (User *U : RdxVal->users()) {
          auto *RdxOp = dyn_cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // ... (account for a fused multiply-add by subtracting the
              // folded multiply's cost)
              FMACost -= FMulCost;
              // ...
              ScalarCost += FMACost;
            }
            // ...
          } else {
            ScalarCost = InstructionCost::getInvalid();
            break;
          }
        }
        // ...
        Cost += ScalarCost;
        // ...
        Cost += GenCostFn();
      }
      return Cost;
    };
    // ...
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          if (/* ... revectorized case ... */) {
            unsigned ScalarTyNumElements = VecTy->getNumElements();
            // ... (sum the per-lane reduction costs; arguments elided)
            //       ReducedVals.size()),
            // ...
          } else {
            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
                std::make_pair(RedTy, true));
            if (RType == RedTy) {
              // ... (plain arithmetic reduction cost)
            } else {
              // ... (extended reduction cost:
              //      RdxOpcode, !IsSigned, RedTy, ...)
            }
          }
        } else {
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          // ...
          if (RdxKind == RecurKind::FAdd) {
            // Check if the reduction operands can be fused into fmuladd.
            for (Value *RdxVal : ReducedVals) {
              // ...
              FMF &= FPCI->getFastMathFlags();
              // ...
            }
            if (!Ops.empty()) {
              // ...
              IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                          {RVecTy, RVecTy, RVecTy}, FMF);
              // ...
              InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                  Instruction::FMul, RVecTy, CostKind);
              LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost
                                << "\n");
              FMACost -= FMulCost;
              // ...
              if (FMACost.isValid())
                VectorCost += FMACost;
            }
          }
          // ...
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            // ...
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            // ... (add the cast cost)
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      // ...
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          // ... (min/max reduction cost)
        } else {
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          // ...
          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
          // ...
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            // ...
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            // ...
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        // ...
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      /* ... */
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
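  // The returned value is VectorCost - ScalarCost, so a negative result means
  // the vector form is cheaper than keeping the scalars; the caller in
  // tryToReduce() compares it against -SLPCostThreshold before committing to
  // vectorization.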
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // ...
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // ...
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
      // ... (fold the repeat count Cnt into the vector value itself)
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // ... (i1 with the same scale: replicate via a shuffle mask instead
          // of multiplying)
          LLVM_DEBUG(dbgs() << /* ... */ ". (HorRdx)\n");
          // ...
          std::iota(std::next(Mask.begin(), VF * I),
                    std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          break;
        }
        // ... (vector * Cnt)
        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        break;
      }
      case RecurKind::Xor: {
        // ... (even repeat counts cancel)
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        break;
      }
      case RecurKind::FAdd: {
        // ... (vector fmul by Cnt)
        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // Idempotent ops: the repeat count does not change the result.
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        llvm_unreachable("Unexpected reduction kind for repeated scalar.");
      }
      // First vectorized value becomes the accumulator.
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
        return;
      }
      // ...
      ++NumVectorInstructions;
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          /* ... */) {
        // ...
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
      }
      if (VecResVF < VecVF) {
        // ...
      }
      if (VecResVF != VecVF) {
        // ... (resize the shorter operand with an identity mask)
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        // ...
      }
      // ...
      if (VecResVF < VecVF) {
        // ...
      }
      if (VecResVF != VecVF)
        // ...
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      if (VecResVF != VecVF)
        // ...
      VecRes = Op;
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);

    return ReducedSubTree;
  }
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        /* ... */) {
      // Bitcast the i1 mask to an integer of matching width (the reduction
      // then becomes a bit-count; details elided).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      // ...
    }
    ++NumVectorInstructions;
    // ...
  }
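  // The i1 special case above is effectively a population count: an
  // add-reduction of an <N x i1> mask is just the number of set bits, so
  // instead of widening every lane the mask is bitcast to an iN integer and
  // reduced with a single bit-count, e.g.
  //   %int = bitcast <8 x i1> %mask to i8
  //   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
  // followed by a cast to the destination type if needed (the exact guard
  // conditions are elided above).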
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      // ... (an even repeat count cancels to zero)
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
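      // Repeated scalars fold into a single scaled operation because the
      // kinds handled above have cheap closed forms; e.g. with Cnt == 4:
      //   add:  v + v + v + v  ==  v * 4
      //   fadd: v + v + v + v  ==  v * 4.0 (presumably legal only under the
      //         fast-math conditions checked when IsSupportedHorRdxIdentityOp
      //         was computed)
      //   xor:  v ^ v ^ v ^ v  ==  0 (even count cancels, odd count == v)
      //   and/or/min/max: idempotent, the result stays v.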
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    // ...
    if (VTy->getElementType() != VL.front()->getType()) {
      // ... (cast the vector to the scalar element type)
      //     R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      // ...
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul): " /* ... */ << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << /* ... */ ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << /* ... */ ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6>
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        // ...
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      // ...
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
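      // This is the vector-wide analogue of emitScaleForReusedOps: instead of
      // one scalar repeat count, each lane gets its own count. E.g. an add
      // reduction whose third lane was matched twice is rescaled as
      //   %res = mul <4 x i32> %vec, <i32 1, i32 1, i32 2, i32 1>
      // and for xor, lanes with an even repeat count are zeroed via a shuffle
      // whose second operand is a zeroinitializer vector.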
};
} // end anonymous namespace

/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    // ...
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
                            BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
  // ...
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // ...
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Value *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  // ...

  // There is a loop latch, return the incoming value if it comes from that.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
25391 "Expected binop, select, or intrinsic for reduction matching");
25393 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25395 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25406 Value *Op0 =
nullptr;
25407 Value *Op1 =
nullptr;
25416 Value *B0 =
nullptr, *B1 =
nullptr;
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  // ...
  auto SelectRoot = [&]() {
    // ...
  };

  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [&](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    // ...
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      // ...
    }
    // ...
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // ...
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        // ...
      }
    }

    // Try to vectorize operands. Continue analysis for the instruction from
    // the same basic block only to save compile time.
    if (/* ... depth limit ... */)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            if (/* ... */ !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
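// The traversal above is breadth-first (std::queue) seeded with the reduction
// root: when a node fails to match as a reduction, its same-block operands
// are enqueued (up to a fixed depth) so inner expressions still get a
// vectorization attempt. Matched-but-deferred seeds are collected in
// PostponedInsts and handed to tryToVectorize() after the walk, since
// vectorizing them immediately could invalidate the traversal.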
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  // ...
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      /* ... */) {
    // ...
  }
  // ...
  BasicBlock *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    // ...
    Type *Ty = Inst->getType();
    // ...
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Estimate the cost of the 2-element reduction (pieces elided):
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(/* ... */) +
        TTI.getInstructionCost(Inst, CostKind);
    // ...
    FastMathFlags FMF;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
      FMF = FPCI->getFastMathFlags();
    InstructionCost RedCost = TTI.getArithmeticReductionCost(
        Inst->getOpcode(), VecTy, FMF, CostKind);
    // ...
    if (RedCost >= ScalarCost)
      return false;
    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      /* ... */)
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (/* ... */
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // ...
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      // ...
    } else {
      // Keep small runs around for a combined attempt later.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        // ...
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using smaller vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (/* ... */
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
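// The helper above implements the generic "sort, group, retry" strategy
// shared by PHIs, compares and stores: values are stable-sorted with
// Comparator, maximal runs of AreCompatible values are attempted first with
// MaxVFOnly set (full-register bundles only), and leftovers are pooled and
// re-attempted with smaller vector factors before being discarded.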
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  // ... (compare swapped-predicate-normalized predicates)
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    // ...
      return !IsCompatibility;
    // ...
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          // ...
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares.
  // ... (CompareSorter and AreCompatibleCompares wrap compareCmp<> above)
  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // Pass 1: try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // Pass 2: try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    // ...
    // Pass 3: try to match and vectorize a buildvector sequence with any VF.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in their use trees.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // ...
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() &&
              I1->getOpcode() == I2->getOpcode()) {
            // ... (order by position within the block)
            //     DT->getNode(V1->getParent());
            //     DT->getNode(V2->getParent());
            //     return NodeI2 != nullptr; ...
            // assert((NodeI1 == NodeI2) == (same DFS numbers) && ...);
            // if (NodeI1 != NodeI2) ...
            return V1->comesBefore(V2);
          }
          // ...
          //   return *Id1 < *Id2;
          // ...
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
      }
      // ...
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
        continue;
      if (ValID1 < ValID2)
        return true;
      if (ValID1 > ValID2)
        return false;
      // ...
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    // ...
  };
  auto AreCompatiblePHIs = [&](ArrayRef<Value *> VL, Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    // ...
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // ...
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          // ...
        }
      // ...
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };
  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P /* ... */)
        break;
      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>(V)) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction produces no "interesting" users.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           /* ... */;
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // ...
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Restart: some instructions were deleted and the iterator may be
        // invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        // ...
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      // ...
      auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
      // Try to vectorize the chain feeding this store, unless it is not the
      // last store of its chain.
      TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                            SI->getValueOperand()->hasOneUse();
      // ...
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // ...
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      // ... (restart the scan if anything changed)
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      // ... (take the candidate slice and keep program order in a SetVector)

      // Some candidates may have been vectorized already, or their index
      // folded to a constant; remove them.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences, and GEPs that share the same index.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out as soon as fewer than two candidates remain.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        // ...
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
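// Note that what gets bundled here are the GEPs' *index* computations, not
// the GEPs themselves: pairs whose addresses differ by a constant
// (SCEV-constant difference) or that share the same index value are pruned
// first, since vectorizing their indices would not save any scalar work.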
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // ...
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this, &R](ArrayRef<StoreInst *> VL,
                                          StoreInst *V1) {
    // ...
    bool SameParent = true;
    StoreInst *V2 = VL.back();
    // ... (the stored types, pointer types and value kinds must match)
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        // ...
        SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
        // ...
        SmallVector<Value *> NewVL(VL.size() + 1);
        for (auto [SI, V] : zip(VL, NewVL))
          V = SI->getValueOperand();
        NewVL.back() = V1->getValueOperand();
        InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
        InstructionsState S = Analysis.buildInstructionsState(/* ... */);
        // ...
      }
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    // ...
    // Reverse the stores for bottom-to-top analysis; if the same address is
    // stored several times, the order matters.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors with these masks.
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns the power-of-2 number of elements in a single register (part), given the total number of elements Size and the number of registers (parts) NumParts.
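A hedged worked example, assuming the helper rounds the per-part element count ceil(Size / NumParts) up to a power of two:
unsigned PerPart = getPartNumElems(/*Size=*/6, /*NumParts=*/2);
// ceil(6 / 2) == 3, rounded up to a power of two: PerPart == 4 under that assumption.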
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
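Restated from the description above (not from the implementation): the check accepts when sqrt((1/n) * sum_i (s_i - mean)^2) < 0.9 * mean, where the s_i are the recorded tree sizes and mean is their average.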
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates a key/subkey pair for the given value to provide effective sorting of the values and better detection of vectorizable value sequences.
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but also handles the case where ScalarTy is itself a FixedVectorType (i.e. when the vectorized "scalars" are themselves small vectors), costing whole subvectors instead of individual elements.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of the vectorized intrinsic (if possible) and vectorized function (if possible) calls.
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the argument types vector for the given call instruction with the given intrinsic ID for the specified vectorization factor VF.
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms a type that TTI splits into whole vector registers.
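A hedged worked example for both helpers, assuming 4-element vector registers for Ty:
// getFloorFullVectorNumberOfElements(TTI, Ty, /*Sz=*/6) -> (6 / 4) * 4 == 4
// getFullVectorNumberOfElements(TTI, Ty, /*Sz=*/6)      -> 8, the next whole-register count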
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and vector costs of vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but also handles the case where Dst is a FixedVectorType (the vector-of-vectors case), costing the whole-subvector extract.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector consisting only of loads, if that can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x i8> %x, i32 0, followed by similar extracts that together are equivalent to a single shufflevector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and, if not, packs them, building the ReuseShuffleIndices mask.
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds; those entries are fixed up so that Order forms a proper permutation.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but also handles the case where ScalarTy is a FixedVectorType (the vector-of-vectors case), costing whole-subvector insertion/extraction.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope exit.
This file defines generic set operations that may be used on sets of different types.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the order used for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
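A tiny self-contained illustration of the APInt entries above, in the demanded-elements style the vectorizer uses:
APInt Demanded = APInt::getAllOnes(/*numBits=*/8); // 0xFF: all lanes demanded
Demanded.clearBit(3);                              // lane 3 no longer demanded
assert(!Demanded.isAllOnes() && Demanded.getZExtValue() == 0xF7);
Demanded.setBit(3);                                // back to 0xFF
assert(Demanded.isAllOnes());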
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
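A short illustration of the ArrayRef views above; every operation below is a view, not a copy:
int Data[] = {0, 1, 2, 3, 4};
ArrayRef<int> Mask(Data);
assert(Mask.front() == 0 && Mask.back() == 4 && Mask.size() == 5);
assert(Mask.drop_front(2).equals({2, 3, 4}));
assert(Mask.slice(1, 3).equals({1, 2, 3})); // first index, then length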
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed, or null if it is not.
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, OGT -> ULE, OLT -> UGE, etc.
Predicate getPredicate() const
Return the predicate for this instruction.
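A small illustration of the predicate helpers above (static overloads taking a Predicate also exist): swapping the operands of (a < b) yields (b > a), while logical inversion yields (a >= b):
CmpInst::Predicate P = CmpInst::ICMP_SLT;
assert(CmpInst::getSwappedPredicate(P) == CmpInst::ICMP_SGT);
assert(CmpInst::getInversePredicate(P) == CmpInst::ICMP_SGE);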
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
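A minimal DenseMap sketch over the entries above; V0 and V1 are assumed Value pointers:
DenseMap<const Value *, unsigned> Lane;
Lane.try_emplace(V0, 0u);                 // inserted only if V0 is absent
Lane.insert({V1, 1u});
assert(Lane.contains(V1) && Lane.count(V0) == 1);
assert(Lane.at(V1) == 1u);                // aborts if the key is missing
Lane.erase(V0);
assert(Lane.lookup(V0) == 0u);            // absent keys yield a default value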
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that it dereferences to a node pointer.
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
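A compact IRBuilder sketch using only the entries above; BB is an existing BasicBlock and A, B are i32 Values, all assumed to be in scope:
IRBuilder<> Builder(BB->getContext());
Builder.SetInsertPoint(BB);                            // append at the block's end
Value *Prod = Builder.CreateMul(A, B, "prod");
Value *Cond = Builder.CreateICmp(CmpInst::ICMP_ULT, Prod, B, "cond");
Value *Min = Builder.CreateSelect(Cond, Prod, B, "min");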
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory).
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
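A hypothetical PointerUnion use: tagging a memory operation as either a load or a store without a wrapper class (LI is an assumed LoadInst pointer):
PointerUnion<LoadInst *, StoreInst *> MemOp = LI;
assert(!MemOp.isNull());
if (auto *L = MemOp.dyn_cast<LoadInst *>())
  (void)L->getPointerOperand(); // taken: MemOp currently holds the LoadInst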
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
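A short SetVector sketch (via the inline-storage SmallSetVector variant listed below); V0 and V1 are assumed distinct Value pointers:
SmallSetVector<Value *, 8> Worklist;   // unique elements, insertion order kept
Worklist.insert(V0);
Worklist.insert(V1);
assert(!Worklist.insert(V0));          // duplicate: rejected, order unchanged
assert(Worklist.size() == 2 && Worklist.front() == V0);
SmallVector<Value *, 8> Flat = Worklist.takeVector(); // empties the set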
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0, VF) is used exactly once in each submask of size VF.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor, like: <Index, Index+Factor, Index+2*Factor, ...>.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
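Two of the mask classifiers above applied to concrete masks:
int Rev[] = {3, 2, 1, 0};
assert(ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));
int Sub[] = {2, 3};
int Index = 0;
assert(ShuffleVectorInst::isExtractSubvectorMask(Sub, /*NumSrcElts=*/4, Index));
assert(Index == 2); // the extracted subvector starts at source element 2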
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
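A SmallBitVector walk-through of the entries above:
SmallBitVector Covered(8);                 // 8 bits, initially all clear
Covered.set(2);
Covered.set(5);
assert(Covered.any() && !Covered.all() && !Covered.none());
assert(Covered.find_first() == 2 && Covered.find_next(2) == 5);
assert(Covered.count() == 2 && Covered.test(5));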
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N elements).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+, etc.).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to a SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state with which the loads in the given sequence can be represented.
void reorderTopToBottom()
Reorders the current graph to the most profitable order, starting from the root node down to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const bool IsAnyPointerUsedOutGraph, const int64_t Diff, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was already checked for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen, and returns its signedness if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes, and their scalar users.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking trivially dead operands for deletion.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users in UserIgnoreLst for the purpose of scheduling and extraction.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry plus (possibly) a permutation with other gathers.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
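Taken together, the BoUpSLP entries above compose into the pass's usual analyze-then-emit sequence. The following is a simplified, hedged sketch rather than a verbatim excerpt; the constructor arguments match the BoUpSLP entry above, Roots is an assumed ArrayRef<Value *>, and SLPCostThreshold is the cl::opt listed earlier:
BoUpSLP R(F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE);
R.buildTree(Roots);                       // grow the vectorizable tree
if (R.isTreeTinyAndNotFullyVectorizable())
  return false;                           // not worth costing
R.reorderTopToBottom();
R.reorderBottomToTop();
R.buildExternalUses();
R.computeMinimumValueSizes();             // min-bitwidth analysis
InstructionCost Cost = R.getTreeCost();
if (Cost < -SLPCostThreshold) {           // profitable under the threshold
  R.vectorizeTree();                      // emit the vector code
  return true;
}
return false;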
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance of forming the best SLP tree.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
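An illustration of the PatternMatch entries above; V is an assumed Value pointer:
using namespace llvm::PatternMatch;
Value *X, *Y;
if (match(V, m_OneUse(m_Add(m_Value(X), m_Value(Y))))) {
  // V is a single-use add; X and Y now name its two operands.
}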
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
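A hedged sketch (VL and Mask are hypothetical names here):
for (auto [Idx, V] : enumerate(VL))
  if (isa<PoisonValue>(V))
    Mask[Idx] = PoisonMaskElem; // Idx is the 0-based position of V in VL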
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>'s and is nicely complemented with set_union(A, B).
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
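A minimal sketch (V and Ptr are hypothetical names):
if (auto *LI = dyn_cast<LoadInst>(V)) // yields nullptr when V is not a LoadInst
  Ptr = LI->getPointerOperand();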
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
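A common hedged pattern (BB and TLI assumed in scope): instructions can be erased while walking the block because the iterator is advanced before the body runs:
for (Instruction &I : make_early_inc_range(BB))
  if (isInstructionTriviallyDead(&I, TLI))
    I.eraseFromParent();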
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
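Worked values for this contract:
unsigned A = alignDown(37u, 8u);     // 32, the largest multiple of 8 <= 37
unsigned B = alignDown(37u, 8u, 3u); // 35, the largest value <= 37 that is 3 (mod 8)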
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
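Worked values:
unsigned VF = bit_ceil(5u); // 8
unsigned W = bit_ceil(16u); // 16, already a power of two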
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container: C.erase(remove(C.begin(), C.end(), V), C.end());
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
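For example, selecting every other lane starting at 0:
SmallVector<int, 16> M = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
// M == {0, 2, 4, 6}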
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
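A minimal sketch of the intended relation (not necessarily the verbatim body): if Indices is a permutation where Indices[I] is the source position of element I, the produced Mask satisfies Mask[Indices[I]] == I, so shuffling by Mask undoes the reordering:
Mask.assign(Indices.size(), PoisonMaskElem);
for (unsigned I = 0, E = Indices.size(); I != E; ++I)
  Mask[Indices[I]] = I;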
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
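For example, replicating each of two lanes three times:
SmallVector<int, 16> M = createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
// M == {0, 0, 0, 1, 1, 1}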
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type, i.e. adding an extra element results in extra parts upon type legalization.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that is a candidate for vectorization.
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
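Worked value, e.g. when computing how many register-sized parts a bundle needs:
unsigned Parts = divideCeil(7u, 2u); // 4: seven elements fill four two-element parts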
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts into which the type VecTy will be split at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
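A minimal sketch:
unsigned Sum = 0;
for (unsigned I : seq<unsigned>(0, 4)) // visits 0, 1, 2, 3; End is excluded
  Sum += I;                            // Sum == 6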
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
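A hedged sketch with hypothetical inputs Ptr (a pointer) and Idx (an unsigned); the combination is order-sensitive, so (Ptr, Idx) and (Idx, Ptr) hash differently:
hash_code H = hash_combine(Ptr, Idx);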
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address intrinsics from the specified value, returning the original object being addressed.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the type of C.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx, or on the return type if OpdIdx is -1.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type T.
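A hedged sketch of how the members above can compose (not the verbatim code of this specialization): the EdgeInfo key is treated as the pair (UserTE, EdgeIdx) and delegates to the nested helpers:
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
  return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
                                  SecondInfo::getHashValue(Val.EdgeIdx));
}
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS) {
  return FirstInfo::isEqual(LHS.UserTE, RHS.UserTE) &&
         SecondInfo::isEqual(LHS.EdgeIdx, RHS.EdgeIdx);
}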
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC number of lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair and std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair and std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const