160#define LV_NAME "loop-vectorize"
161#define DEBUG_TYPE LV_NAME
171 "llvm.loop.vectorize.followup_vectorized";
173 "llvm.loop.vectorize.followup_epilogue";
176STATISTIC(LoopsVectorized,
"Number of loops vectorized");
177STATISTIC(LoopsAnalyzed,
"Number of loops analyzed for vectorization");
178STATISTIC(LoopsEpilogueVectorized,
"Number of epilogues vectorized");
182 cl::desc(
"Enable vectorization of epilogue loops."));
186 cl::desc(
"When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
192 cl::desc(
"Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
199 cl::desc(
"Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
205 cl::desc(
"The maximum allowed number of runtime memory checks"));
221 "prefer-predicate-over-epilogue",
224 cl::desc(
"Tail-folding and predication preferences over creating a scalar "
228 "Don't tail-predicate loops, create scalar epilogue"),
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
239 "force-tail-folding-style",
cl::desc(
"Force the tail folding style"),
242 clEnumValN(TailFoldingStyle::None,
"none",
"Disable tail folding"),
244 TailFoldingStyle::Data,
"data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow,
"data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL,
"data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));
261 cl::desc(
"Maximize bandwidth when selecting vectorization factor which "
262 "will be determined by the smallest type in loop."));
266 cl::desc(
"Enable vectorization on interleaved memory accesses in a loop"));
272 cl::desc(
"Enable vectorization on masked interleaved memory accesses in a loop"));
276 cl::desc(
"A flag that overrides the target's number of scalar registers."));
280 cl::desc(
"A flag that overrides the target's number of vector registers."));
284 cl::desc(
"A flag that overrides the target's max interleave factor for "
289 cl::desc(
"A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
294 cl::desc(
"A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
307 "The cost of a loop that is considered 'small' by the interleaver."));
311 cl::desc(
"Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
319 "Enable runtime interleaving until load/store ports are saturated"));
324 cl::desc(
"Max number of stores to be predicated behind an if."));
328 cl::desc(
"Count the induction variable only once when interleaving"));
332 cl::desc(
"Enable if predication of stores during vectorization."));
336 cl::desc(
"The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
342 cl::desc(
"Prefer in-loop vector reductions, "
343 "overriding the targets preference."));
347 cl::desc(
"Enable the vectorisation of loops with in-order (strict) "
353 "Prefer predicating a reduction operation over an after loop select."));
358 cl::desc(
"Enable VPlan-native vectorization path with "
359 "support for outer loop vectorization."));
369 "Build VPlan for every supported loop nest in the function and bail "
370 "out right after the build (stress test the VPlan H-CFG construction "
371 "in the VPlan-native vectorization path)."));
375 cl::desc(
"Enable loop interleaving in Loop vectorization passes"));
378 cl::desc(
"Run the Loop vectorization passes"));
382 cl::desc(
"Use dot format instead of plain text when dumping VPlans"));
385 "force-widen-divrem-via-safe-divisor",
cl::Hidden,
387 "Override cost based safe divisor widening for div/rem instructions"));
390 "vectorizer-maximize-bandwidth-for-vector-calls",
cl::init(
true),
392 cl::desc(
"Try wider VFs if they enable the use of vector variants"));
411 return DL.getTypeAllocSizeInBits(Ty) !=
DL.getTypeSizeInBits(Ty);
450 unsigned Factor = Vals.
size();
451 assert(Factor > 1 &&
"Tried to interleave invalid number of vectors");
455 for (
Value *Val : Vals)
456 assert(Val->getType() == VecTy &&
"Tried to interleave mismatched types");
461 if (VecTy->isScalableTy()) {
462 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
463 return Builder.
CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
472 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
479class GeneratedRTChecks;
523 this->MinProfitableTripCount = VecWidth;
539 virtual std::pair<BasicBlock *, Value *>
566 VPValue *BlockInMask,
bool NeedsMaskForGaps);
581 std::pair<BasicBlock *, Value *> AdditionalBypass = {
nullptr,
nullptr});
651 const SCEV2ValueTy &ExpandedSCEVs,
652 std::pair<BasicBlock *, Value *> AdditionalBypass = {
nullptr,
nullptr});
799 "A high UF for the epilogue loop is likely not beneficial.");
819 GeneratedRTChecks &Checks)
821 EPI.MainLoopVF,
EPI.MainLoopVF,
EPI.MainLoopUF, LVL,
828 const SCEV2ValueTy &ExpandedSCEVs)
final {
835 virtual std::pair<BasicBlock *, Value *>
859 GeneratedRTChecks &Check)
864 std::pair<BasicBlock *, Value *>
888 GeneratedRTChecks &Checks)
895 std::pair<BasicBlock *, Value *>
917 if (
I->getDebugLoc() !=
Empty)
918 return I->getDebugLoc();
920 for (
Use &
Op :
I->operands()) {
922 if (OpInst->getDebugLoc() !=
Empty)
923 return OpInst->getDebugLoc();
926 return I->getDebugLoc();
935 dbgs() <<
"LV: " << Prefix << DebugMsg;
957 CodeRegion =
I->getParent();
960 if (
I->getDebugLoc())
961 DL =
I->getDebugLoc();
978 return B.CreateElementCount(Ty, VF);
984 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
"Invalid loop count");
998 <<
"loop not vectorized: " << OREMsg);
1016 "Vectorizing: ", TheLoop->
isInnermost() ?
"innermost loop" :
"outer loop",
1022 <<
"vectorized " << LoopType <<
"loop (vectorization width: "
1024 <<
", interleaved count: " <<
ore::NV(
"InterleaveCount", IC) <<
")";
1036 if (
const DebugLoc LoopDbgLoc = L->getStartLoc())
1037 LoopDbgLoc.print(
OS);
1040 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1185 "Profitable to scalarize relevant only for VF > 1.");
1188 "cost-model should not be used for outer loops (in VPlan-native path)");
1190 auto Scalars = InstsToScalarize.find(VF);
1191 assert(Scalars != InstsToScalarize.end() &&
1192 "VF not yet analyzed for scalarization profitability");
1193 return Scalars->second.contains(
I);
1200 "cost-model should not be used for outer loops (in VPlan-native path)");
1204 if (isa<PseudoProbeInst>(
I))
1210 auto UniformsPerVF = Uniforms.find(VF);
1211 assert(UniformsPerVF != Uniforms.end() &&
1212 "VF not yet analyzed for uniformity");
1213 return UniformsPerVF->second.count(
I);
1220 "cost-model should not be used for outer loops (in VPlan-native path)");
1224 auto ScalarsPerVF = Scalars.find(VF);
1225 assert(ScalarsPerVF != Scalars.end() &&
1226 "Scalar values are not calculated for VF");
1227 return ScalarsPerVF->second.count(
I);
1233 return VF.
isVector() && MinBWs.contains(
I) &&
1255 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W,
Cost);
1266 for (
unsigned i = 0; i < Grp->
getFactor(); ++i) {
1269 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W,
Cost);
1271 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W, 0);
1283 "cost-model should not be used for outer loops (in VPlan-native path)");
1285 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(
I, VF);
1286 auto Itr = WideningDecisions.
find(InstOnVF);
1287 if (Itr == WideningDecisions.
end())
1289 return Itr->second.first;
1296 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(
I, VF);
1298 "The cost is not calculated");
1299 return WideningDecisions[InstOnVF].second;
1312 std::optional<unsigned> MaskPos,
1315 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1322 return CallWideningDecisions.
at(std::make_pair(CI, VF));
1330 auto *Trunc = dyn_cast<TruncInst>(
I);
1343 Value *
Op = Trunc->getOperand(0);
1363 if (VF.
isScalar() || Uniforms.contains(VF))
1367 collectLoopUniforms(VF);
1368 collectLoopScalars(VF);
1388 bool LI = isa<LoadInst>(V);
1389 bool SI = isa<StoreInst>(V);
1404 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1405 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1416 return ScalarCost < SafeDivisorCost;
1440 std::pair<InstructionCost, InstructionCost>
1481 auto RequiresScalarEpilogue = [
this](
ElementCount VF) {
1484 bool IsRequired =
all_of(Range, RequiresScalarEpilogue);
1486 (IsRequired ||
none_of(Range, RequiresScalarEpilogue)) &&
1487 "all VFs in range must agree on whether a scalar epilogue is required");
1499 if (!ChosenTailFoldingStyle)
1501 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1502 : ChosenTailFoldingStyle->second;
1510 assert(!ChosenTailFoldingStyle &&
"Tail folding must not be selected yet.");
1512 ChosenTailFoldingStyle =
1518 ChosenTailFoldingStyle = std::make_pair(
1533 IsScalableVF && UserIC <= 1 &&
1544 ChosenTailFoldingStyle =
1549 <<
"LV: Preference for VP intrinsics indicated. Will "
1550 "not try to generate VP Intrinsics "
1552 ?
"since interleave count specified is greater than 1.\n"
1553 :
"due to non-interleaving reasons.\n"));
1579 return InLoopReductions.contains(Phi);
1594 WideningDecisions.
clear();
1595 CallWideningDecisions.
clear();
1625 unsigned NumPredStores = 0;
1634 bool FoldTailByMasking);
1639 ElementCount getMaximizedVFForTarget(
unsigned MaxTripCount,
1640 unsigned SmallestType,
1641 unsigned WidestType,
1643 bool FoldTailByMasking);
1647 ElementCount getMaxLegalScalableVF(
unsigned MaxSafeElements);
1660 std::optional<InstructionCost>
1708 PredicatedBBsAfterVectorization;
1721 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1722 ChosenTailFoldingStyle;
1756 ScalarCostsTy &ScalarCosts,
1782 std::pair<InstWidening, InstructionCost>>;
1784 DecisionList WideningDecisions;
1786 using CallDecisionList =
1789 CallDecisionList CallWideningDecisions;
1812 Ops, [
this, VF](
Value *V) {
return this->needsExtract(V, VF); }));
1870class GeneratedRTChecks {
1876 Value *SCEVCheckCond =
nullptr;
1884 Value *MemRuntimeCheckCond =
nullptr;
1893 bool CostTooHigh =
false;
1894 const bool AddBranchWeights;
1896 Loop *OuterLoop =
nullptr;
1901 bool AddBranchWeights)
1902 : DT(DT), LI(LI),
TTI(
TTI), SCEVExp(SE,
DL,
"scev.check"),
1903 MemCheckExp(SE,
DL,
"scev.check"), AddBranchWeights(AddBranchWeights) {}
1931 nullptr,
"vector.scevcheck");
1938 if (RtPtrChecking.Need) {
1939 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1940 MemCheckBlock =
SplitBlock(Pred, Pred->getTerminator(), DT, LI,
nullptr,
1943 auto DiffChecks = RtPtrChecking.getDiffChecks();
1945 Value *RuntimeVF =
nullptr;
1950 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1956 MemCheckBlock->
getTerminator(), L, RtPtrChecking.getChecks(),
1959 assert(MemRuntimeCheckCond &&
1960 "no RT checks generated although RtPtrChecking "
1961 "claimed checks are required");
1964 if (!MemCheckBlock && !SCEVCheckBlock)
1974 if (SCEVCheckBlock) {
1979 if (MemCheckBlock) {
1986 if (MemCheckBlock) {
1990 if (SCEVCheckBlock) {
1996 OuterLoop =
L->getParentLoop();
2000 if (SCEVCheckBlock || MemCheckBlock)
2013 if (SCEVCheckBlock->getTerminator() == &
I)
2020 if (MemCheckBlock) {
2023 if (MemCheckBlock->getTerminator() == &
I)
2046 unsigned BestTripCount = 2;
2050 BestTripCount = SmallTC;
2054 BestTripCount = *EstimatedTC;
2057 BestTripCount = std::max(BestTripCount, 1U);
2061 NewMemCheckCost = std::max(*NewMemCheckCost.
getValue(),
2064 if (BestTripCount > 1)
2066 <<
"We expect runtime memory checks to be hoisted "
2067 <<
"out of the outer loop. Cost reduced from "
2068 << MemCheckCost <<
" to " << NewMemCheckCost <<
'\n');
2070 MemCheckCost = NewMemCheckCost;
2074 RTCheckCost += MemCheckCost;
2077 if (SCEVCheckBlock || MemCheckBlock)
2078 LLVM_DEBUG(
dbgs() <<
"Total cost of runtime checks: " << RTCheckCost
2086 ~GeneratedRTChecks() {
2090 SCEVCleaner.markResultUsed();
2092 if (!MemRuntimeCheckCond)
2093 MemCheckCleaner.markResultUsed();
2095 if (MemRuntimeCheckCond) {
2096 auto &SE = *MemCheckExp.
getSE();
2103 I.eraseFromParent();
2106 MemCheckCleaner.cleanup();
2107 SCEVCleaner.cleanup();
2110 SCEVCheckBlock->eraseFromParent();
2111 if (MemRuntimeCheckCond)
2112 MemCheckBlock->eraseFromParent();
2126 SCEVCheckCond =
nullptr;
2127 if (
auto *
C = dyn_cast<ConstantInt>(
Cond))
2138 SCEVCheckBlock->getTerminator()->eraseFromParent();
2139 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2140 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2147 if (AddBranchWeights)
2150 return SCEVCheckBlock;
2159 if (!MemRuntimeCheckCond)
2168 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2175 if (AddBranchWeights) {
2179 MemCheckBlock->getTerminator()->setDebugLoc(
2180 Pred->getTerminator()->getDebugLoc());
2183 MemRuntimeCheckCond =
nullptr;
2184 return MemCheckBlock;
2190 return Style == TailFoldingStyle::Data ||
2191 Style == TailFoldingStyle::DataAndControlFlow ||
2192 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2196 return Style == TailFoldingStyle::DataAndControlFlow ||
2197 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2227 LLVM_DEBUG(
dbgs() <<
"LV: Loop hints prevent outer loop vectorization.\n");
2233 LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: Interleave is not supported for "
2253 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2263 for (
Loop *InnerL : L)
2285 ?
B.CreateSExtOrTrunc(
Index, StepTy)
2286 :
B.CreateCast(Instruction::SIToFP,
Index, StepTy);
2287 if (CastedIndex !=
Index) {
2289 Index = CastedIndex;
2299 assert(
X->getType() ==
Y->getType() &&
"Types don't match!");
2300 if (
auto *CX = dyn_cast<ConstantInt>(
X))
2303 if (
auto *CY = dyn_cast<ConstantInt>(
Y))
2306 return B.CreateAdd(
X,
Y);
2312 assert(
X->getType()->getScalarType() ==
Y->getType() &&
2313 "Types don't match!");
2314 if (
auto *CX = dyn_cast<ConstantInt>(
X))
2317 if (
auto *CY = dyn_cast<ConstantInt>(
Y))
2320 VectorType *XVTy = dyn_cast<VectorType>(
X->getType());
2321 if (XVTy && !isa<VectorType>(
Y->getType()))
2322 Y =
B.CreateVectorSplat(XVTy->getElementCount(),
Y);
2323 return B.CreateMul(
X,
Y);
2326 switch (InductionKind) {
2329 "Vector indices not supported for integer inductions yet");
2331 "Index type does not match StartValue type");
2332 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2333 return B.CreateSub(StartValue,
Index);
2341 "Vector indices not supported for FP inductions yet");
2344 (InductionBinOp->
getOpcode() == Instruction::FAdd ||
2345 InductionBinOp->
getOpcode() == Instruction::FSub) &&
2346 "Original bin op should be defined for FP induction");
2349 return B.CreateBinOp(InductionBinOp->
getOpcode(), StartValue, MulExp,
2363 if (
F.hasFnAttribute(Attribute::VScaleRange))
2364 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2366 return std::nullopt;
2375 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2377 unsigned MaxUF = UF ? *UF :
Cost->TTI.getMaxInterleaveFactor(VF);
2379 Type *IdxTy =
Cost->Legal->getWidestInductionType();
2380 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2386 Cost->PSE.getSE()->getSmallConstantMaxTripCount(
Cost->TheLoop)) {
2389 std::optional<unsigned> MaxVScale =
2393 MaxVF *= *MaxVScale;
2396 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2444 VPValue *BlockInMask,
bool NeedsMaskForGaps) {
2446 const DataLayout &
DL = Instr->getModule()->getDataLayout();
2450 unsigned InterleaveFactor = Group->
getFactor();
2459 "Reversed masked interleave-group not supported.");
2477 for (
unsigned Part = 0; Part <
UF; Part++) {
2479 if (
auto *
I = dyn_cast<Instruction>(AddrPart))
2494 bool InBounds =
false;
2496 InBounds =
gep->isInBounds();
2504 auto CreateGroupMask = [
this, &BlockInMask, &State, &InterleaveFactor](
2505 unsigned Part,
Value *MaskForGaps) ->
Value * {
2507 assert(!MaskForGaps &&
"Interleaved groups with gaps are not supported.");
2508 assert(InterleaveFactor == 2 &&
2509 "Unsupported deinterleave factor for scalable vectors");
2510 auto *BlockInMaskPart = State.
get(BlockInMask, Part);
2515 nullptr,
"interleaved.mask");
2521 Value *BlockInMaskPart = State.
get(BlockInMask, Part);
2525 "interleaved.mask");
2532 if (isa<LoadInst>(Instr)) {
2533 Value *MaskForGaps =
nullptr;
2534 if (NeedsMaskForGaps) {
2537 assert(MaskForGaps &&
"Mask for Gaps is required but it is null");
2542 for (
unsigned Part = 0; Part <
UF; Part++) {
2544 if (BlockInMask || MaskForGaps) {
2546 "masked interleaved groups are not allowed.");
2547 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2550 GroupMask, PoisonVec,
"wide.masked.vec");
2559 if (VecTy->isScalableTy()) {
2560 assert(InterleaveFactor == 2 &&
2561 "Unsupported deinterleave factor for scalable vectors");
2563 for (
unsigned Part = 0; Part <
UF; ++Part) {
2567 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2568 nullptr,
"strided.vec");
2570 for (
unsigned I = 0;
I < InterleaveFactor; ++
I) {
2578 if (Member->getType() != ScalarTy) {
2586 State.
set(VPDefs[J], StridedVec, Part);
2597 for (
unsigned I = 0;
I < InterleaveFactor; ++
I) {
2606 for (
unsigned Part = 0; Part <
UF; Part++) {
2608 NewLoads[Part], StrideMask,
"strided.vec");
2611 if (Member->getType() != ScalarTy) {
2620 State.
set(VPDefs[J], StridedVec, Part);
2631 Value *MaskForGaps =
2634 "masked interleaved groups are not allowed.");
2636 "masking gaps for scalable vectors is not yet supported.");
2637 for (
unsigned Part = 0; Part <
UF; Part++) {
2640 unsigned StoredIdx = 0;
2641 for (
unsigned i = 0; i < InterleaveFactor; i++) {
2643 "Fail to get a member from an interleaved store group");
2653 Value *StoredVec = State.
get(StoredValues[StoredIdx], Part);
2661 if (StoredVec->
getType() != SubVT)
2670 if (BlockInMask || MaskForGaps) {
2671 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2686 assert(!Instr->getType()->isAggregateType() &&
"Can't handle vectors");
2690 if (isa<NoAliasScopeDeclInst>(Instr))
2695 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2699 Cloned->
setName(Instr->getName() +
".cloned");
2704 "inferred type and type from generated instructions do not match");
2710 if (
auto DL = Instr->getDebugLoc())
2716 auto InputInstance = Instance;
2720 Cloned->
setOperand(
I.index(), State.
get(Operand, InputInstance));
2727 State.
set(RepRecipe, Cloned, Instance);
2730 if (
auto *II = dyn_cast<AssumeInst>(Cloned))
2735 if (IfPredicateInstr)
2759 if (
Cost->foldTailByMasking()) {
2761 "VF*UF must be a power of 2 when folding tail by masking");
2793 auto *DstFVTy = cast<VectorType>(DstVTy);
2794 auto VF = DstFVTy->getElementCount();
2795 auto *SrcVecTy = cast<VectorType>(V->getType());
2796 assert(
VF == SrcVecTy->getElementCount() &&
"Vector dimensions do not match");
2797 Type *SrcElemTy = SrcVecTy->getElementType();
2798 Type *DstElemTy = DstFVTy->getElementType();
2799 assert((
DL.getTypeSizeInBits(SrcElemTy) ==
DL.getTypeSizeInBits(DstElemTy)) &&
2800 "Vector elements must have same size");
2811 "Only one type should be a pointer type");
2813 "Only one type should be a floating point type");
2839 auto CreateStep = [&]() ->
Value * {
2864 Value *MaxUIntTripCount =
2865 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2879 "TC check is expected to dominate Bypass");
2900 if (!SCEVCheckBlock)
2906 "Cannot SCEV check stride or overflow when optimizing for size");
2921 return SCEVCheckBlock;
2940 "Cannot emit memory checks when optimizing for size, unless forced "
2946 <<
"Code-size may be reduced by not forcing "
2947 "vectorization, or by source-code modifications "
2948 "eliminating the need for runtime checks "
2949 "(e.g., adding 'restrict').";
2957 return MemCheckBlock;
2966 "multiple exit loop without required epilogue?");
2970 LI,
nullptr,
Twine(Prefix) +
"middle.block");
2973 nullptr,
Twine(Prefix) +
"scalar.ph");
2990 BrInst->
setDebugLoc(ScalarLatchTerm->getDebugLoc());
3006 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3012 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3013 if (OrigPhi == OldInduction) {
3028 if (AdditionalBypass.first) {
3029 B.SetInsertPoint(AdditionalBypass.first,
3030 AdditionalBypass.first->getFirstInsertionPt());
3031 EndValueFromAdditionalBypass =
3034 EndValueFromAdditionalBypass->
setName(
"ind.end");
3054 if (AdditionalBypass.first)
3056 EndValueFromAdditionalBypass);
3063 const SCEV2ValueTy &ExpandedSCEVs) {
3064 const SCEV *Step =
ID.getStep();
3065 if (
auto *
C = dyn_cast<SCEVConstant>(Step))
3066 return C->getValue();
3067 if (
auto *U = dyn_cast<SCEVUnknown>(Step))
3068 return U->getValue();
3069 auto I = ExpandedSCEVs.find(Step);
3070 assert(
I != ExpandedSCEVs.end() &&
"SCEV must be expanded at this point");
3075 const SCEV2ValueTy &ExpandedSCEVs,
3076 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3077 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3078 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3079 "Inconsistent information about additional bypass.");
3088 PHINode *OrigPhi = InductionEntry.first;
3113 !
Cost->foldTailByMasking()) {
3122 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3135#ifdef EXPENSIVE_CHECKS
3142std::pair<BasicBlock *, Value *>
3144 const SCEV2ValueTy &ExpandedSCEVs) {
3229 assert(isa<PHINode>(UI) &&
"Expected LCSSA form");
3230 MissingVals[UI] = EndValue;
3238 auto *UI = cast<Instruction>(U);
3240 assert(isa<PHINode>(UI) &&
"Expected LCSSA form");
3247 Value *CountMinusOne =
B.CreateSub(
3249 CountMinusOne->
setName(
"cmo");
3252 assert(StepVPV &&
"step must have been expanded during VPlan execution");
3254 : State.
get(StepVPV, {0, 0});
3258 Escape->
setName(
"ind.escape");
3259 MissingVals[UI] = Escape;
3263 for (
auto &
I : MissingVals) {
3270 if (
PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3271 PHI->addIncoming(
I.second, MiddleBlock);
3279struct CSEDenseMapInfo {
3281 return isa<InsertElementInst>(
I) || isa<ExtractElementInst>(
I) ||
3282 isa<ShuffleVectorInst>(
I) || isa<GetElementPtrInst>(
I);
3294 assert(canHandle(
I) &&
"Unknown instruction!");
3296 I->value_op_end()));
3300 if (
LHS == getEmptyKey() ||
RHS == getEmptyKey() ||
3301 LHS == getTombstoneKey() ||
RHS == getTombstoneKey())
3303 return LHS->isIdenticalTo(
RHS);
3314 if (!CSEDenseMapInfo::canHandle(&In))
3320 In.replaceAllUsesWith(V);
3321 In.eraseFromParent();
3335 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3340 if (
auto RedCost = getReductionPatternCost(CI, VF,
RetTy,
CostKind))
3344 for (
auto &ArgOp : CI->
args())
3353 return std::min(ScalarCallCost, IntrinsicCost);
3355 return ScalarCallCost;
3368 assert(
ID &&
"Expected intrinsic call!");
3371 if (
auto *FPMO = dyn_cast<FPMathOperator>(CI))
3372 FMF = FPMO->getFastMathFlags();
3378 std::back_inserter(ParamTys),
3379 [&](
Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3382 dyn_cast<IntrinsicInst>(CI));
3388 auto *I1 = cast<IntegerType>(cast<VectorType>(
T1)->getElementType());
3389 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3390 return I1->getBitWidth() < I2->getBitWidth() ?
T1 : T2;
3394 auto *I1 = cast<IntegerType>(cast<VectorType>(
T1)->getElementType());
3395 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3396 return I1->getBitWidth() > I2->getBitWidth() ?
T1 : T2;
3414 if (
auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3428 for (
PHINode &PN : Exit->phis())
3458 KV.second->fixPhi(Plan, State);
3541 Value *RuntimeVF =
nullptr;
3543 auto *One = ConstantInt::get(IdxTy, 1);
3551 auto RecurSplice = cast<VPInstruction>(*PhiR->
user_begin());
3553 RecurSplice->getOpcode() ==
3555 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3557 for (
VPUser *U : RecurSplice->users())
3558 if (
auto *LiveOut = dyn_cast<VPLiveOut>(U))
3561 if (!LiveOuts.
empty()) {
3567 Value *ExtractForPhiUsedOutsideLoop =
nullptr;
3573 assert(
UF > 1 &&
"VF and UF cannot both be 1");
3578 ExtractForPhiUsedOutsideLoop = State.
get(PreviousDef,
UF - 2);
3583 PHINode *LCSSAPhi = LiveOut->getPhi();
3600 Phi->setName(
"scalar.recur");
3617 auto isBlockOfUsePredicated = [&](
Use &U) ->
bool {
3618 auto *
I = cast<Instruction>(U.getUser());
3620 if (
auto *Phi = dyn_cast<PHINode>(
I))
3621 BB = Phi->getIncomingBlock(
3623 return BB == PredBB;
3634 Worklist.
insert(InstsToReanalyze.
begin(), InstsToReanalyze.
end());
3635 InstsToReanalyze.
clear();
3638 while (!Worklist.
empty()) {
3644 if (!
I || isa<PHINode>(
I) || !VectorLoop->contains(
I) ||
3645 I->mayHaveSideEffects() ||
I->mayReadFromMemory())
3653 if (
I->getParent() == PredBB) {
3654 Worklist.
insert(
I->op_begin(),
I->op_end());
3668 I->moveBefore(&*PredBB->getFirstInsertionPt());
3669 Worklist.
insert(
I->op_begin(),
I->op_end());
3681 for (
VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3686 PHINode *NewPhi = cast<PHINode>(State.
get(VPPhi, 0));
3698void LoopVectorizationCostModel::collectLoopScalars(
ElementCount VF) {
3703 "This function should not be visited twice for the same VF");
3709 Scalars[VF].
insert(Uniforms[VF].begin(), Uniforms[VF].end());
3728 "Widening decision should be ready at this moment");
3729 if (
auto *Store = dyn_cast<StoreInst>(MemAccess))
3730 if (
Ptr == Store->getValueOperand())
3733 "Ptr is neither a value or pointer operand");
3739 auto isLoopVaryingBitCastOrGEP = [&](
Value *
V) {
3740 return ((isa<BitCastInst>(V) &&
V->getType()->isPointerTy()) ||
3741 isa<GetElementPtrInst>(V)) &&
3752 if (!isLoopVaryingBitCastOrGEP(
Ptr))
3757 auto *
I = cast<Instruction>(
Ptr);
3765 return isa<LoadInst>(U) || isa<StoreInst>(U);
3769 PossibleNonScalarPtrs.
insert(
I);
3787 for (
auto &
I : *BB) {
3788 if (
auto *Load = dyn_cast<LoadInst>(&
I)) {
3789 evaluatePtrUse(Load,
Load->getPointerOperand());
3790 }
else if (
auto *Store = dyn_cast<StoreInst>(&
I)) {
3791 evaluatePtrUse(Store,
Store->getPointerOperand());
3792 evaluatePtrUse(Store,
Store->getValueOperand());
3795 for (
auto *
I : ScalarPtrs)
3796 if (!PossibleNonScalarPtrs.
count(
I)) {
3804 auto ForcedScalar = ForcedScalars.
find(VF);
3805 if (ForcedScalar != ForcedScalars.
end())
3806 for (
auto *
I : ForcedScalar->second) {
3807 LLVM_DEBUG(
dbgs() <<
"LV: Found (forced) scalar instruction: " << *
I <<
"\n");
3816 while (
Idx != Worklist.
size()) {
3818 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3820 auto *Src = cast<Instruction>(Dst->getOperand(0));
3822 auto *J = cast<Instruction>(U);
3823 return !TheLoop->contains(J) || Worklist.count(J) ||
3824 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3825 isScalarUse(J, Src));
3828 LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Src <<
"\n");
3835 auto *Ind = Induction.first;
3836 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3845 auto IsDirectLoadStoreFromPtrIndvar = [&](
Instruction *Indvar,
3847 return Induction.second.getKind() ==
3849 (isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
3856 auto *I = cast<Instruction>(U);
3857 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3858 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3866 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3872 auto ScalarIndUpdate =
3874 auto *I = cast<Instruction>(U);
3875 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3876 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3878 if (!ScalarIndUpdate)
3883 Worklist.
insert(IndUpdate);
3884 LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Ind <<
"\n");
3885 LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *IndUpdate
3899 switch(
I->getOpcode()) {
3902 case Instruction::Call:
3905 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(
I), VF))
3907 case Instruction::Load:
3908 case Instruction::Store: {
3920 case Instruction::UDiv:
3921 case Instruction::SDiv:
3922 case Instruction::SRem:
3923 case Instruction::URem: {
3939 switch(
I->getOpcode()) {
3942 case Instruction::Load:
3943 case Instruction::Store: {
3956 (isa<LoadInst>(
I) ||
3957 (isa<StoreInst>(
I) &&
3963 case Instruction::UDiv:
3964 case Instruction::SDiv:
3965 case Instruction::SRem:
3966 case Instruction::URem:
3970 case Instruction::Call:
3975std::pair<InstructionCost, InstructionCost>
3978 assert(
I->getOpcode() == Instruction::UDiv ||
3979 I->getOpcode() == Instruction::SDiv ||
3980 I->getOpcode() == Instruction::SRem ||
3981 I->getOpcode() == Instruction::URem);
3992 ScalarizationCost = 0;
4007 ScalarizationCost += getScalarizationOverhead(
I, VF,
CostKind);
4021 Instruction::Select, VecTy,
4027 Value *Op2 =
I->getOperand(1);
4036 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4038 return {ScalarizationCost, SafeDivisorCost};
4045 "Decision should not be set yet.");
4047 assert(Group &&
"Must have a group.");
4051 auto &
DL =
I->getModule()->getDataLayout();
4058 unsigned InterleaveFactor = Group->getFactor();
4059 bool ScalarNI =
DL.isNonIntegralPointerType(ScalarTy);
4060 for (
unsigned i = 0; i < InterleaveFactor; i++) {
4065 bool MemberNI =
DL.isNonIntegralPointerType(
MemberTy);
4067 if (MemberNI != ScalarNI) {
4070 }
else if (MemberNI && ScalarNI &&
4071 ScalarTy->getPointerAddressSpace() !=
4072 MemberTy->getPointerAddressSpace()) {
4082 bool PredicatedAccessRequiresMasking =
4085 bool LoadAccessWithGapsRequiresEpilogMasking =
4086 isa<LoadInst>(
I) && Group->requiresScalarEpilogue() &&
4088 bool StoreAccessWithGapsRequiresMasking =
4089 isa<StoreInst>(
I) && (Group->getNumMembers() < Group->getFactor());
4090 if (!PredicatedAccessRequiresMasking &&
4091 !LoadAccessWithGapsRequiresEpilogMasking &&
4092 !StoreAccessWithGapsRequiresMasking)
4099 "Masked interleave-groups for predicated accesses are not enabled.");
4101 if (Group->isReverse())
4113 assert((isa<LoadInst, StoreInst>(
I)) &&
"Invalid memory instruction");
4129 auto &
DL =
I->getModule()->getDataLayout();
4136void LoopVectorizationCostModel::collectLoopUniforms(
ElementCount VF) {
4143 "This function should not be visited twice for the same VF");
4147 Uniforms[VF].
clear();
4155 auto isOutOfScope = [&](
Value *V) ->
bool {
4168 auto addToWorklistIfAllowed = [&](
Instruction *
I) ->
void {
4169 if (isOutOfScope(
I)) {
4175 LLVM_DEBUG(
dbgs() <<
"LV: Found not uniform being ScalarWithPredication: "
4179 LLVM_DEBUG(
dbgs() <<
"LV: Found uniform instruction: " << *
I <<
"\n");
4189 auto *
Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
4191 addToWorklistIfAllowed(Cmp);
4200 if (PrevVF.isVector()) {
4201 auto Iter = Uniforms.
find(PrevVF);
4202 if (Iter != Uniforms.
end() && !Iter->second.contains(
I))
4207 if (isa<LoadInst>(
I))
4218 "Widening decision should be ready at this moment");
4220 if (isUniformMemOpUse(
I))
4223 return (WideningDecision ==
CM_Widen ||
4232 if (isa<StoreInst>(
I) &&
I->getOperand(0) ==
Ptr)
4248 for (
auto &
I : *BB) {
4250 switch (II->getIntrinsicID()) {
4251 case Intrinsic::sideeffect:
4252 case Intrinsic::experimental_noalias_scope_decl:
4253 case Intrinsic::assume:
4254 case Intrinsic::lifetime_start:
4255 case Intrinsic::lifetime_end:
4257 addToWorklistIfAllowed(&
I);
4266 if (
auto *EVI = dyn_cast<ExtractValueInst>(&
I)) {
4267 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4268 "Expected aggregate value to be loop invariant");
4269 addToWorklistIfAllowed(EVI);
4278 if (isUniformMemOpUse(&
I))
4279 addToWorklistIfAllowed(&
I);
4281 if (isVectorizedMemAccessUse(&
I,
Ptr))
4288 for (
auto *V : HasUniformUse) {
4289 if (isOutOfScope(V))
4291 auto *
I = cast<Instruction>(V);
4292 auto UsersAreMemAccesses =
4294 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4296 if (UsersAreMemAccesses)
4297 addToWorklistIfAllowed(
I);
4304 while (idx != Worklist.
size()) {
4307 for (
auto *OV :
I->operand_values()) {
4309 if (isOutOfScope(OV))
4313 auto *
OP = dyn_cast<PHINode>(OV);
4318 auto *OI = cast<Instruction>(OV);
4320 auto *J = cast<Instruction>(U);
4321 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4323 addToWorklistIfAllowed(OI);
4335 auto *Ind = Induction.first;
4336 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4341 auto *I = cast<Instruction>(U);
4342 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4343 isVectorizedMemAccessUse(I, Ind);
4350 auto UniformIndUpdate =
4352 auto *I = cast<Instruction>(U);
4353 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4354 isVectorizedMemAccessUse(I, IndUpdate);
4356 if (!UniformIndUpdate)
4360 addToWorklistIfAllowed(Ind);
4361 addToWorklistIfAllowed(IndUpdate);
4372 "runtime pointer checks needed. Enable vectorization of this "
4373 "loop with '#pragma clang loop vectorize(enable)' when "
4374 "compiling with -Os/-Oz",
4375 "CantVersionLoopWithOptForSize",
ORE,
TheLoop);
4381 "runtime SCEV checks needed. Enable vectorization of this "
4382 "loop with '#pragma clang loop vectorize(enable)' when "
4383 "compiling with -Os/-Oz",
4384 "CantVersionLoopWithOptForSize",
ORE,
TheLoop);
4391 "runtime stride == 1 checks needed. Enable vectorization of "
4392 "this loop without such check by compiling with -Os/-Oz",
4393 "CantVersionLoopWithOptForSize",
ORE,
TheLoop);
4401LoopVectorizationCostModel::getMaxLegalScalableVF(
unsigned MaxSafeElements) {
4407 "ScalableVectorizationDisabled",
ORE,
TheLoop);
4411 LLVM_DEBUG(
dbgs() <<
"LV: Scalable vectorization is available\n");
4414 std::numeric_limits<ElementCount::ScalarTy>::max());
4425 "Scalable vectorization not supported for the reduction "
4426 "operations found in this loop.",
4438 "for all element types found in this loop.",
4444 return MaxScalableVF;
4454 "Max legal vector width too small, scalable vectorization "
4458 return MaxScalableVF;
4462 unsigned MaxTripCount,
ElementCount UserVF,
bool FoldTailByMasking) {
4464 unsigned SmallestType, WidestType;
4471 unsigned MaxSafeElements =
4475 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4477 LLVM_DEBUG(
dbgs() <<
"LV: The max safe fixed VF is: " << MaxSafeFixedVF
4479 LLVM_DEBUG(
dbgs() <<
"LV: The max safe scalable VF is: " << MaxSafeScalableVF
4484 auto MaxSafeUserVF =
4485 UserVF.
isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4502 <<
" is unsafe, clamping to max safe VF="
4503 << MaxSafeFixedVF <<
".\n");
4508 <<
"User-specified vectorization factor "
4509 <<
ore::NV(
"UserVectorizationFactor", UserVF)
4510 <<
" is unsafe, clamping to maximum safe vectorization factor "
4511 <<
ore::NV(
"VectorizationFactor", MaxSafeFixedVF);
4513 return MaxSafeFixedVF;
4518 <<
" is ignored because scalable vectors are not "
4524 <<
"User-specified vectorization factor "
4525 <<
ore::NV(
"UserVectorizationFactor", UserVF)
4526 <<
" is ignored because the target does not support scalable "
4527 "vectors. The compiler will pick a more suitable value.";
4531 <<
" is unsafe. Ignoring scalable UserVF.\n");
4536 <<
"User-specified vectorization factor "
4537 <<
ore::NV(
"UserVectorizationFactor", UserVF)
4538 <<
" is unsafe. Ignoring the hint to let the compiler pick a "
4539 "more suitable value.";
4544 LLVM_DEBUG(
dbgs() <<
"LV: The Smallest and Widest types: " << SmallestType
4545 <<
" / " << WidestType <<
" bits.\n");
4550 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4551 MaxSafeFixedVF, FoldTailByMasking))
4555 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4556 MaxSafeScalableVF, FoldTailByMasking))
4557 if (MaxVF.isScalable()) {
4558 Result.ScalableVF = MaxVF;
4559 LLVM_DEBUG(
dbgs() <<
"LV: Found feasible scalable VF = " << MaxVF
4572 "Not inserting runtime ptr check for divergent target",
4573 "runtime pointer checks needed. Not enabled for divergent target",
4574 "CantVersionLoopWithDivergentTarget",
ORE,
TheLoop);
4583 "loop trip count is one, irrelevant for vectorization",
4588 switch (ScalarEpilogueStatus) {
4590 return computeFeasibleMaxVF(MaxTC, UserVF,
false);
4595 dbgs() <<
"LV: vector predicate hint/switch found.\n"
4596 <<
"LV: Not allowing scalar epilogue, creating predicated "
4597 <<
"vector loop.\n");
4604 dbgs() <<
"LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4606 LLVM_DEBUG(
dbgs() <<
"LV: Not allowing scalar epilogue due to low trip "
4625 LLVM_DEBUG(
dbgs() <<
"LV: Cannot fold tail by masking: vectorize with a "
4626 "scalar epilogue instead.\n");
4628 return computeFeasibleMaxVF(MaxTC, UserVF,
false);
4639 "No decisions should have been taken at this point");
4649 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4654 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4655 *MaxPowerOf2RuntimeVF,
4658 MaxPowerOf2RuntimeVF = std::nullopt;
4661 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4663 "MaxFixedVF must be a power of 2");
4664 unsigned MaxVFtimesIC =
4665 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4669 BackedgeTakenCount, SE->
getOne(BackedgeTakenCount->
getType()));
4675 LLVM_DEBUG(
dbgs() <<
"LV: No tail will remain for any chosen VF.\n");
4689 <<
"LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4690 "try to generate VP Intrinsics with scalable vector "
4696 "Expected scalable vector factor.");
4706 LLVM_DEBUG(
dbgs() <<
"LV: Cannot fold tail by masking: vectorize with a "
4707 "scalar epilogue instead.\n");
4713 LLVM_DEBUG(
dbgs() <<
"LV: Can't fold tail by masking: don't vectorize\n");
4719 "Unable to calculate the loop count due to complex control flow",
4720 "unable to calculate the loop count due to complex control flow",
4726 "Cannot optimize for size and vectorize at the same time.",
4727 "cannot optimize for size and vectorize at the same time. "
4728 "Enable vectorization of this loop with '#pragma clang loop "
4729 "vectorize(enable)' when compiling with -Os/-Oz",
4734ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4735 unsigned MaxTripCount,
unsigned SmallestType,
unsigned WidestType,
4737 bool ComputeScalableMaxVF = MaxSafeVF.
isScalable();
4745 "Scalable flags must match");
4753 ComputeScalableMaxVF);
4754 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4756 << (MaxVectorElementCount * WidestType) <<
" bits.\n");
4758 if (!MaxVectorElementCount) {
4760 << (ComputeScalableMaxVF ?
"scalable" :
"fixed")
4761 <<
" vector registers.\n");
4765 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4766 if (MaxVectorElementCount.isScalable() &&
4770 WidestRegisterMinEC *= Min;
4779 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4787 LLVM_DEBUG(
dbgs() <<
"LV: Clamping the MaxVF to maximum power of two not "
4788 "exceeding the constant trip count: "
4789 << ClampedUpperTripCount <<
"\n");
4791 ClampedUpperTripCount,
4792 FoldTailByMasking ? MaxVectorElementCount.isScalable() :
false);
4805 ComputeScalableMaxVF);
4806 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4820 for (
int i = RUs.size() - 1; i >= 0; --i) {
4821 bool Selected =
true;
4822 for (
auto &pair : RUs[i].MaxLocalUsers) {
4824 if (pair.second > TargetNumRegisters)
4836 <<
") with target's minimum: " << MinVF <<
'\n');
4852static std::optional<unsigned>
4854 const Function *Fn = L->getHeader()->getParent();
4858 auto Max = Attr.getVScaleRangeMax();
4859 if (Max && Min == Max)
4866bool LoopVectorizationPlanner::isMoreProfitable(
4873 if (!
A.Width.isScalable() && !
B.Width.isScalable() && MaxTripCount) {
4882 auto GetCostForTC = [MaxTripCount,
this](
unsigned VF,
4886 : VectorCost * (MaxTripCount / VF) +
4887 ScalarCost * (MaxTripCount % VF);
4889 auto RTCostA = GetCostForTC(
A.Width.getFixedValue(), CostA,
A.ScalarCost);
4890 auto RTCostB = GetCostForTC(
B.Width.getFixedValue(), CostB,
B.ScalarCost);
4892 return RTCostA < RTCostB;
4896 unsigned EstimatedWidthA =
A.Width.getKnownMinValue();
4897 unsigned EstimatedWidthB =
B.Width.getKnownMinValue();
4899 if (
A.Width.isScalable())
4900 EstimatedWidthA *= *VScale;
4901 if (
B.Width.isScalable())
4902 EstimatedWidthB *= *VScale;
4908 if (
A.Width.isScalable() && !
B.Width.isScalable())
4909 return (CostA *
B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4914 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4920 if (InvalidCosts.
empty())
4927 std::map<Instruction *, unsigned> Numbering;
4929 for (
auto &Pair : InvalidCosts)
4930 if (!Numbering.count(Pair.first))
4931 Numbering[Pair.first] =
I++;
4935 if (Numbering[
A.first] != Numbering[
B.first])
4936 return Numbering[
A.first] < Numbering[
B.first];
4938 return ECC(
A.second,
B.second);
4950 Subset =
Tail.take_front(1);
4959 if (Subset ==
Tail ||
Tail[Subset.size()].first !=
I) {
4960 std::string OutString;
4962 assert(!Subset.empty() &&
"Unexpected empty range");
4963 OS <<
"Instruction with invalid costs prevented vectorization at VF=(";
4964 for (
const auto &Pair : Subset)
4965 OS << (Pair.second == Subset.front().second ?
"" :
", ") << Pair.second;
4967 if (
auto *CI = dyn_cast<CallInst>(
I))
4968 OS <<
" call to " << CI->getCalledFunction()->getName();
4970 OS <<
" " <<
I->getOpcodeName();
4973 Tail =
Tail.drop_front(Subset.size());
4977 Subset =
Tail.take_front(Subset.size() + 1);
4978 }
while (!
Tail.empty());
4985 LLVM_DEBUG(
dbgs() <<
"LV: Scalar loop costs: " << ExpectedCost <<
".\n");
4986 assert(ExpectedCost.
isValid() &&
"Unexpected invalid cost for scalar loop");
4988 "Expected Scalar VF to be a candidate");
4995 if (ForceVectorization && VFCandidates.
size() > 1) {
5003 for (
const auto &i : VFCandidates) {
5013 unsigned AssumedMinimumVscale =
5016 Candidate.Width.isScalable()
5017 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5018 : Candidate.Width.getFixedValue();
5020 <<
" costs: " << (Candidate.Cost / Width));
5023 << AssumedMinimumVscale <<
")");
5027 if (!
C.second && !ForceVectorization) {
5029 dbgs() <<
"LV: Not considering vector loop of width " << i
5030 <<
" because it will not generate any vector instructions.\n");
5035 if (isMoreProfitable(Candidate, ScalarCost))
5036 ProfitableVFs.push_back(Candidate);
5038 if (isMoreProfitable(Candidate, ChosenFactor))
5039 ChosenFactor = Candidate;
5046 "There are conditional stores.",
5047 "store that is conditionally executed prevents vectorization",
5048 "ConditionalStore", ORE, OrigLoop);
5049 ChosenFactor = ScalarCost;
5053 !isMoreProfitable(ChosenFactor, ScalarCost))
dbgs()
5054 <<
"LV: Vectorization seems to be not beneficial, "
5055 <<
"but was forced by a user.\n");
5057 return ChosenFactor;
5060bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5065 [&](
PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5073 Entry.first->getIncomingValueForBlock(OrigLoop->
getLoopLatch());
5075 if (!OrigLoop->
contains(cast<Instruction>(U)))
5078 for (
User *U : Entry.first->users())
5079 if (!OrigLoop->
contains(cast<Instruction>(U)))
5108 unsigned Multiplier = 1;
5120 LLVM_DEBUG(
dbgs() <<
"LEV: Epilogue vectorization is disabled.\n");
5125 LLVM_DEBUG(
dbgs() <<
"LEV: Unable to vectorize epilogue because no "
5126 "epilogue is allowed.\n");
5132 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5133 LLVM_DEBUG(
dbgs() <<
"LEV: Unable to vectorize epilogue because the loop "
5134 "is not a supported candidate.\n");
5139 LLVM_DEBUG(
dbgs() <<
"LEV: Epilogue vectorization factor is forced.\n");
5142 return {ForcedEC, 0, 0};
5144 LLVM_DEBUG(
dbgs() <<
"LEV: Epilogue vectorization forced factor is not "
5153 dbgs() <<
"LEV: Epilogue vectorization skipped due to opt for size.\n");
5158 LLVM_DEBUG(
dbgs() <<
"LEV: Epilogue vectorization is not profitable for "
5170 EstimatedRuntimeVF *= *VScale;
5175 const SCEV *RemainingIterations =
nullptr;
5176 for (
auto &NextVF : ProfitableVFs) {
5183 if ((!NextVF.Width.isScalable() && MainLoopVF.
isScalable() &&
5190 if (!MainLoopVF.
isScalable() && !NextVF.Width.isScalable()) {
5192 if (!RemainingIterations) {
5199 SE.
getConstant(TCType, NextVF.Width.getKnownMinValue()),
5200 RemainingIterations))
5204 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5210 << Result.Width <<
"\n");
5214std::pair<unsigned, unsigned>
5216 unsigned MinWidth = -1U;
5217 unsigned MaxWidth = 8;
5230 MaxWidth = std::min<unsigned>(
5231 MaxWidth, std::min<unsigned>(
5237 MinWidth = std::min<unsigned>(
5238 MinWidth,
DL.getTypeSizeInBits(
T->getScalarType()).getFixedValue());
5239 MaxWidth = std::max<unsigned>(
5240 MaxWidth,
DL.getTypeSizeInBits(
T->getScalarType()).getFixedValue());
5243 return {MinWidth, MaxWidth};
5251 for (
Instruction &
I : BB->instructionsWithoutDebug()) {
5259 if (!isa<LoadInst>(
I) && !isa<StoreInst>(
I) && !isa<PHINode>(
I))
5264 if (
auto *PN = dyn_cast<PHINode>(&
I)) {
5278 if (
auto *ST = dyn_cast<StoreInst>(&
I))
5279 T = ST->getValueOperand()->getType();
5282 "Expected the load/store/recurrence type to be sized");
5311 LLVM_DEBUG(
dbgs() <<
"LV: Preference for VP intrinsics indicated. "
5312 "Unroll factor forced to be 1.\n");
5325 if (LoopCost == 0) {
5327 assert(LoopCost.
isValid() &&
"Expected to have chosen a VF with valid cost");
5337 for (
auto& pair : R.MaxLocalUsers) {
5338 pair.second = std::max(pair.second, 1U);
5352 unsigned IC = UINT_MAX;
5354 for (
auto& pair : R.MaxLocalUsers) {
5366 unsigned MaxLocalUsers = pair.second;
5367 unsigned LoopInvariantRegs = 0;
5368 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5369 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5371 unsigned TmpIC =
llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5375 TmpIC =
llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5376 std::max(1U, (MaxLocalUsers - 1)));
5379 IC = std::min(IC, TmpIC);
5397 EstimatedVF *= *VScale;
5399 assert(EstimatedVF >= 1 &&
"Estimated VF shouldn't be less than 1");
5405 unsigned AvailableTC =
5417 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5418 unsigned InterleaveCountLB =
bit_floor(std::max(
5419 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5420 MaxInterleaveCount = InterleaveCountLB;
5422 if (InterleaveCountUB != InterleaveCountLB) {
5423 unsigned TailTripCountUB =
5424 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5425 unsigned TailTripCountLB =
5426 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5429 if (TailTripCountUB == TailTripCountLB)
5430 MaxInterleaveCount = InterleaveCountUB;
5432 }
else if (BestKnownTC && *BestKnownTC > 0) {
5436 ? (*BestKnownTC) - 1
5444 MaxInterleaveCount =
bit_floor(std::max(
5445 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5448 assert(MaxInterleaveCount > 0 &&
5449 "Maximum interleave count must be greater than 0");
5453 if (IC > MaxInterleaveCount)
5454 IC = MaxInterleaveCount;
5457 IC = std::max(1u, IC);
5459 assert(IC > 0 &&
"Interleave count must be greater than 0.");
5463 if (VF.
isVector() && HasReductions) {
5464 LLVM_DEBUG(
dbgs() <<
"LV: Interleaving because of reductions.\n");
5472 bool ScalarInterleavingRequiresPredication =
5474 return Legal->blockNeedsPredication(BB);
5476 bool ScalarInterleavingRequiresRuntimePointerCheck =
5482 <<
"LV: IC is " << IC <<
'\n'
5483 <<
"LV: VF is " << VF <<
'\n');
5484 const bool AggressivelyInterleaveReductions =
5486 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5487 !ScalarInterleavingRequiresPredication && LoopCost <
SmallLoopCost) {
5491 unsigned SmallIC = std::min(IC, (
unsigned)llvm::bit_floor<uint64_t>(
5498 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5499 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5505 bool HasSelectCmpReductions =
5508 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5509 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5510 RdxDesc.getRecurrenceKind());
5512 if (HasSelectCmpReductions) {
5513 LLVM_DEBUG(
dbgs() <<
"LV: Not interleaving select-cmp reductions.\n");
5523 bool HasOrderedReductions =
5525 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5526 return RdxDesc.isOrdered();
5528 if (HasOrderedReductions) {
5530 dbgs() <<
"LV: Not interleaving scalar ordered reductions.\n");
5535 SmallIC = std::min(SmallIC,
F);
5536 StoresIC = std::min(StoresIC,
F);
5537 LoadsIC = std::min(LoadsIC,
F);
5541 std::max(StoresIC, LoadsIC) > SmallIC) {
5543 dbgs() <<
"LV: Interleaving to saturate store or load ports.\n");
5544 return std::max(StoresIC, LoadsIC);
5549 if (VF.
isScalar() && AggressivelyInterleaveReductions) {
5553 return std::max(IC / 2, SmallIC);
5555 LLVM_DEBUG(
dbgs() <<
"LV: Interleaving to reduce branch cost.\n");
5562 if (AggressivelyInterleaveReductions) {
5612 for (
Instruction &
I : BB->instructionsWithoutDebug()) {
5616 for (
Value *U :
I.operands()) {
5617 auto *Instr = dyn_cast<Instruction>(U);
5628 LoopInvariants.
insert(Instr);
5633 EndPoint[Instr] = IdxToInstr.
size();
5651 LLVM_DEBUG(
dbgs() <<
"LV(REG): Calculating max register usage:\n");
5653 const auto &TTICapture =
TTI;
5660 for (
unsigned int i = 0, s = IdxToInstr.
size(); i < s; ++i) {
5664 InstrList &
List = TransposeEnds[i];
5679 for (
unsigned j = 0, e = VFs.
size(); j < e; ++j) {
5687 if (VFs[j].isScalar()) {
5688 for (
auto *Inst : OpenIntervals) {
5697 for (
auto *Inst : OpenIntervals) {
5710 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5716 auto &Entry = MaxUsages[j][pair.first];
5717 Entry = std::max(Entry, pair.second);
5722 << OpenIntervals.
size() <<
'\n');
5728 for (
unsigned i = 0, e = VFs.
size(); i < e; ++i) {
5734 for (
auto *Inst : LoopInvariants) {
5737 bool IsScalar =
all_of(Inst->users(), [&](
User *U) {
5738 auto *I = cast<Instruction>(U);
5739 return TheLoop != LI->getLoopFor(I->getParent()) ||
5740 isScalarAfterVectorization(I, VFs[i]);
5746 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5750 dbgs() <<
"LV(REG): VF = " << VFs[i] <<
'\n';
5751 dbgs() <<
"LV(REG): Found max usage: " << MaxUsages[i].
size()
5753 for (
const auto &pair : MaxUsages[i]) {
5754 dbgs() <<
"LV(REG): RegisterClass: "
5758 dbgs() <<
"LV(REG): Found invariant usage: " << Invariant.
size()
5760 for (
const auto &pair : Invariant) {
5761 dbgs() <<
"LV(REG): RegisterClass: "
5775bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(
Instruction *
I,
5786 "Expecting a scalar emulated instruction");
5787 return isa<LoadInst>(
I) ||
5788 (isa<StoreInst>(
I) &&
5805 PredicatedBBsAfterVectorization[VF].
clear();
5821 !useEmulatedMaskMemRefHack(&
I, VF) &&
5822 computePredInstDiscount(&
I, ScalarCosts, VF) >= 0)
5825 PredicatedBBsAfterVectorization[VF].
insert(BB);
5833 "Instruction marked uniform-after-vectorization will be predicated");
5851 if (!
I->hasOneUse() || PredInst->
getParent() !=
I->getParent() ||
5870 for (
Use &U :
I->operands())
5871 if (
auto *J = dyn_cast<Instruction>(U.get()))
5883 while (!Worklist.
empty()) {
5887 if (ScalarCosts.contains(
I))
5918 for (
Use &U :
I->operands())
5919 if (
auto *J = dyn_cast<Instruction>(
U.get())) {
5921 "Instruction has non-scalar type");
5922 if (canBeScalarized(J))
5924 else if (needsExtract(J, VF)) {
5926 cast<VectorType>(
ToVectorTy(J->getType(), VF)),
5937 Discount += VectorCost - ScalarCost;
5938 ScalarCosts[
I] = ScalarCost;
5954 for (
Instruction &
I : BB->instructionsWithoutDebug()) {
5963 if (
C.first.isValid() &&
5971 BlockCost.first +=
C.first;
5972 BlockCost.second |=
C.second;
5974 <<
" for VF " << VF <<
" For instruction: " <<
I
5988 Cost.first += BlockCost.first;
5989 Cost.second |= BlockCost.second;
6004 const Loop *TheLoop) {
6006 auto *Gep = dyn_cast<GetElementPtrInst>(
Ptr);
6012 auto SE = PSE.
getSE();
6013 unsigned NumOperands = Gep->getNumOperands();
6014 for (
unsigned i = 1; i < NumOperands; ++i) {
6015 Value *Opd = Gep->getOperand(i);
6017 !
Legal->isInductionVariable(Opd))
6026LoopVectorizationCostModel::getMemInstScalarizationCost(
Instruction *
I,
6029 "Scalarization cost of instruction implies vectorization.");
6076 if (useEmulatedMaskMemRefHack(
I, VF))
6086LoopVectorizationCostModel::getConsecutiveMemOpCost(
Instruction *
I,
6089 auto *VectorTy = cast<VectorType>(
ToVectorTy(ValTy, VF));
6095 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6096 "Stride should be 1 or -1 for consecutive memory access");
6108 bool Reverse = ConsecutiveStride < 0;
6116LoopVectorizationCostModel::getUniformMemOpCost(
Instruction *
I,
6121 auto *VectorTy = cast<VectorType>(
ToVectorTy(ValTy, VF));
6125 if (isa<LoadInst>(
I)) {
6137 (isLoopInvariantStoreValue
6144LoopVectorizationCostModel::getGatherScatterCost(
Instruction *
I,
6147 auto *VectorTy = cast<VectorType>(
ToVectorTy(ValTy, VF));
6158LoopVectorizationCostModel::getInterleaveGroupCost(
Instruction *
I,
6161 auto *VectorTy = cast<VectorType>(
ToVectorTy(ValTy, VF));
6166 assert(Group &&
"Fail to get an interleaved access group.");
6168 unsigned InterleaveFactor = Group->getFactor();
6173 for (
unsigned IF = 0;
IF < InterleaveFactor;
IF++)
6174 if (Group->getMember(IF))
6178 bool UseMaskForGaps =
6180 (isa<StoreInst>(
I) && (Group->getNumMembers() < Group->getFactor()));
6182 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6185 if (Group->isReverse()) {
6188 "Reverse masked interleaved access not supported.");
6189 Cost += Group->getNumMembers() *
6196std::optional<InstructionCost>
6197LoopVectorizationCostModel::getReductionPatternCost(
6202 if (InLoopReductions.
empty() || VF.
isScalar() || !isa<VectorType>(Ty))
6203 return std::nullopt;
6204 auto *VectorTy = cast<VectorType>(Ty);
6221 return std::nullopt;
6232 if (!InLoopReductionImmediateChains.
count(RetI))
6233 return std::nullopt;
6237 Instruction *LastChain = InLoopReductionImmediateChains.
at(RetI);
6239 while (!isa<PHINode>(ReductionPhi))
6240 ReductionPhi = InLoopReductionImmediateChains.
at(ReductionPhi);
6269 if (RedOp && RdxDesc.
getOpcode() == Instruction::Add &&
6282 bool IsUnsigned = isa<ZExtInst>(Op0);
6299 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6300 return I == RetI ? RedCost : 0;
6304 bool IsUnsigned = isa<ZExtInst>(RedOp);
6313 if (RedCost.
isValid() && RedCost < BaseCost + ExtCost)
6314 return I == RetI ? RedCost : 0;
6315 }
else if (RedOp && RdxDesc.
getOpcode() == Instruction::Add &&
6320 bool IsUnsigned = isa<ZExtInst>(Op0);
6343 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6344 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6352 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6353 return I == RetI ? RedCost : 0;
6362 if (RedCost.
isValid() && RedCost < MulCost + BaseCost)
6363 return I == RetI ? RedCost : 0;
6367 return I == RetI ? std::optional<InstructionCost>(BaseCost) :
std::nullopt;
6371LoopVectorizationCostModel::getMemoryInstructionCost(
Instruction *
I,
6389LoopVectorizationCostModel::getInstructionCost(
Instruction *
I,
6400 auto ForcedScalar = ForcedScalars.
find(VF);
6401 if (VF.
isVector() && ForcedScalar != ForcedScalars.
end()) {
6402 auto InstSet = ForcedScalar->second;
6403 if (InstSet.count(
I))
6413 bool TypeNotScalarized =
false;
6444 if (!
RetTy->isVoidTy() &&
6466 for (
auto *V : filterExtractingOperands(Ops, VF))
6469 filterExtractingOperands(Ops, VF), Tys,
CostKind);
6491 auto isLegalToScalarize = [&]() {
6505 if (isa<LoadInst>(
I))
6510 auto &SI = cast<StoreInst>(
I);
6528 if (GatherScatterCost < ScalarizationCost)
6540 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6541 "Expected consecutive stride.");
6550 unsigned NumAccesses = 1;
6553 assert(Group &&
"Fail to get an interleaved access group.");
6559 NumAccesses = Group->getNumMembers();
6561 InterleaveCost = getInterleaveGroupCost(&
I, VF);
6566 ? getGatherScatterCost(&
I, VF) * NumAccesses
6570 getMemInstScalarizationCost(&
I, VF) * NumAccesses;
6576 if (InterleaveCost <= GatherScatterCost &&
6577 InterleaveCost < ScalarizationCost) {
6579 Cost = InterleaveCost;
6580 }
else if (GatherScatterCost < ScalarizationCost) {
6582 Cost = GatherScatterCost;
6585 Cost = ScalarizationCost;
6619 while (!Worklist.
empty()) {
6621 for (
auto &
Op :
I->operands())
6622 if (
auto *InstOp = dyn_cast<Instruction>(
Op))
6623 if ((InstOp->getParent() ==
I->getParent()) && !isa<PHINode>(InstOp) &&
6624 AddrDefs.
insert(InstOp).second)
6628 for (
auto *
I : AddrDefs) {
6629 if (isa<LoadInst>(
I)) {
6643 for (
unsigned I = 0;
I < Group->getFactor(); ++
I) {
6660 "Trying to set a vectorization decision for a scalar VF");
6679 for (
auto &ArgOp : CI->
args())
6684 for (
Type *ScalarTy : ScalarTys)
6690 if (
auto RedCost = getReductionPatternCost(CI, VF,
RetTy,
CostKind)) {
6693 std::nullopt, *RedCost);
6707 getScalarizationOverhead(CI, VF,
CostKind);
6713 bool UsesMask =
false;
6719 if (
Info.Shape.VF != VF)
6723 if (MaskRequired && !
Info.isMasked())
6727 bool ParamsOk =
true;
6729 switch (Param.ParamKind) {
6748 dyn_cast<SCEVAddRecExpr>(SE->
getSCEV(ScalarParam));
6750 if (!SAR || SAR->getLoop() !=
TheLoop) {
6756 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6784 if (VecFunc && UsesMask && !MaskRequired)
6804 if (VectorCost <=
Cost) {
6809 if (IntrinsicCost <=
Cost) {
6810 Cost = IntrinsicCost;
6829 auto hasSingleCopyAfterVectorization = [
this](
Instruction *
I,
6834 auto Scalarized = InstsToScalarize.
find(VF);
6835 assert(Scalarized != InstsToScalarize.
end() &&
6836 "VF not yet analyzed for scalarization profitability");
6837 return !Scalarized->second.count(
I) &&
6839 auto *UI = cast<Instruction>(U);
6840 return !Scalarized->second.count(UI);
6843 (void) hasSingleCopyAfterVectorization;
6851 assert(
I->getOpcode() == Instruction::GetElementPtr ||
6852 I->getOpcode() == Instruction::PHI ||
6853 (
I->getOpcode() == Instruction::BitCast &&
6854 I->getType()->isPointerTy()) ||
6855 hasSingleCopyAfterVectorization(
I, VF));
6861 switch (
I->getOpcode()) {
6862 case Instruction::GetElementPtr:
6868 case Instruction::Br: {
6875 bool ScalarPredicatedBB =
false;
6878 (PredicatedBBsAfterVectorization[VF].count(BI->
getSuccessor(0)) ||
6879 PredicatedBBsAfterVectorization[VF].count(BI->
getSuccessor(1))) &&
6881 ScalarPredicatedBB =
true;
6883 if (ScalarPredicatedBB) {
6905 case Instruction::PHI: {
6906 auto *
Phi = cast<PHINode>(
I);
6913 cast<VectorType>(VectorTy), Mask,
CostKind,
6921 return (
Phi->getNumIncomingValues() - 1) *
6929 case Instruction::UDiv:
6930 case Instruction::SDiv:
6931 case Instruction::URem:
6932 case Instruction::SRem:
6936 ScalarCost : SafeDivisorCost;
6940 case Instruction::Add:
6941 case Instruction::FAdd:
6942 case Instruction::Sub:
6943 case Instruction::FSub:
6944 case Instruction::Mul:
6945 case Instruction::FMul:
6946 case Instruction::FDiv:
6947 case Instruction::FRem:
6948 case Instruction::Shl:
6949 case Instruction::LShr:
6950 case Instruction::AShr:
6951 case Instruction::And:
6952 case Instruction::Or:
6953 case Instruction::Xor: {
6957 if (
I->getOpcode() == Instruction::Mul &&
6963 if (
auto RedCost = getReductionPatternCost(
I, VF, VectorTy,
CostKind))
6968 Value *Op2 =
I->getOperand(1);
6977 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6980 case Instruction::FNeg: {
6983 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6984 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6985 I->getOperand(0),
I);
6987 case Instruction::Select: {
6992 const Value *Op0, *Op1;
7009 Type *CondTy =
SI->getCondition()->getType();
7014 if (
auto *Cmp = dyn_cast<CmpInst>(
SI->getCondition()))
7015 Pred =
Cmp->getPredicate();
7019 case Instruction::ICmp:
7020 case Instruction::FCmp: {
7021 Type *ValTy =
I->getOperand(0)->getType();
7022 Instruction *Op0AsInstruction = dyn_cast<Instruction>(
I->getOperand(0));
7027 cast<CmpInst>(
I)->getPredicate(),
CostKind,
7030 case Instruction::Store:
7031 case Instruction::Load: {
7036 "CM decision should be taken at this point");
7043 return getMemoryInstructionCost(
I, VF);
7045 case Instruction::BitCast:
7046 if (
I->getType()->isPointerTy())
7049 case Instruction::ZExt:
7050 case Instruction::SExt:
7051 case Instruction::FPToUI:
7052 case Instruction::FPToSI:
7053 case Instruction::FPExt:
7054 case Instruction::PtrToInt:
7055 case Instruction::IntToPtr:
7056 case Instruction::SIToFP:
7057 case Instruction::UIToFP:
7058 case Instruction::Trunc:
7059 case Instruction::FPTrunc: {
7062 assert((isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
7063 "Expected a load or a store!");
7089 unsigned Opcode =
I->getOpcode();
7092 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7094 if (
StoreInst *Store = dyn_cast<StoreInst>(*
I->user_begin()))
7095 CCH = ComputeCCH(Store);
7098 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7099 Opcode == Instruction::FPExt) {
7100 if (
LoadInst *Load = dyn_cast<LoadInst>(
I->getOperand(0)))
7101 CCH = ComputeCCH(Load);
7108 auto *Trunc = cast<TruncInst>(
I);
7110 Trunc->getSrcTy(), CCH,
CostKind, Trunc);
7114 if (
auto RedCost = getReductionPatternCost(
I, VF, VectorTy,
CostKind))
7117 Type *SrcScalarTy =
I->getOperand(0)->getType();
7126 Type *MinVecTy = VectorTy;
7127 if (Opcode == Instruction::Trunc) {
7131 }
else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7141 case Instruction::Call:
7143 case Instruction::ExtractValue:
7145 case Instruction::Alloca:
7166 if ((SI = dyn_cast<StoreInst>(&
I)) &&
7209 bool InLoop = !ReductionOperations.
empty();
7212 InLoopReductions.
insert(Phi);
7215 for (
auto *
I : ReductionOperations) {
7216 InLoopReductionImmediateChains[
I] = LastChain;
7220 LLVM_DEBUG(
dbgs() <<
"LV: Using " << (InLoop ?
"inloop" :
"out of loop")
7221 <<
" reduction for phi: " << *Phi <<
"\n");
7229 return tryInsertInstruction(
7242 unsigned WidestType;
7251 unsigned N =
RegSize.getKnownMinValue() / WidestType;
7272 <<
"overriding computed VF.\n");
7277 LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing. Scalable VF requested, but "
7278 <<
"not supported by the target.\n");
7280 "Scalable vectorization requested but not supported by the target",
7281 "the scalable user-specified vectorization width for outer-loop "
7282 "vectorization cannot be used because the target does not support "
7283 "scalable vectors.",
7284 "ScalableVFUnfeasible", ORE, OrigLoop);
7289 "VF needs to be a power of two");
7291 <<
"VF " << VF <<
" to build VPlans.\n");
7298 return {VF, 0 , 0 };
7302 dbgs() <<
"LV: Not vectorizing. Inner loops aren't supported in the "
7303 "VPlan-native path.\n");
7307std::optional<VectorizationFactor>
7315 return std::nullopt;
7322 <<
"LV: Invalidate all interleaved groups due to fold-tail by masking "
7323 "which requires masked-interleaved support.\n");
7334 if (!UserVF.
isZero() && UserVFIsLegal) {
7336 "VF needs to be a power of two");
7342 buildVPlansWithVPRecipes(UserVF, UserVF);
7344 LLVM_DEBUG(
dbgs() <<
"LV: No VPlan could be built for " << UserVF
7346 return std::nullopt;
7350 return {{UserVF, 0, 0}};
7353 "InvalidCost", ORE, OrigLoop);
7366 for (
const auto &VF : VFCandidates) {
7389 return std::nullopt;
7396 [VF](
const VPlanPtr &Plan) {
return Plan->hasVF(VF); }) ==
7398 "Best VF has not a single VPlan.");
7400 for (
const VPlanPtr &Plan : VPlans) {
7401 if (Plan->hasVF(VF))
7411 bool IsUnrollMetadata =
false;
7412 MDNode *LoopID = L->getLoopID();
7415 for (
unsigned i = 1, ie = LoopID->
getNumOperands(); i < ie; ++i) {
7416 auto *MD = dyn_cast<MDNode>(LoopID->
getOperand(i));
7418 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7420 S && S->getString().starts_with(
"llvm.loop.unroll.disable");
7426 if (!IsUnrollMetadata) {
7437 L->setLoopID(NewLoopID);
7447 bool VectorizingEpilogue) {
7452 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->
getOperand(0));
7458 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7461 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7464 ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7466 assert((!VectorizingEpilogue || ResumePhi) &&
7467 "when vectorizing the epilogue loop, we need a resume phi from main "
7484 BCBlockPhi->addIncoming(FinalValue,
Incoming);
7486 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(
Incoming),
7492 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7496 int IncomingEdgeBlockIdx =
7498 assert(IncomingEdgeBlockIdx >= 0 &&
"Invalid block index");
7500 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7501 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7503 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7505 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7508std::pair<DenseMap<const SCEV *, Value *>,
7515 "Trying to execute plan with unsupported VF");
7517 "Trying to execute plan with unsupported UF");
7519 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7520 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7522 if (!IsEpilogueVectorization)
7526 <<
", UF=" << BestUF <<
'\n');
7527 BestVPlan.
setName(
"Final VPlan");
7544 assert(IsEpilogueVectorization &&
"should only re-use the existing trip "
7545 "count during epilogue vectorization");
7549 Value *CanonicalIVStartValue;
7550 std::tie(State.
CFG.
PrevBB, CanonicalIVStartValue) =
7557 std::unique_ptr<LoopVersioning> LVer =
nullptr;
7565 LVer = std::make_unique<LoopVersioning>(
7568 State.
LVer = &*LVer;
7585 CanonicalIVStartValue, State);
7595 dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7604 std::optional<MDNode *> VectorizedLoopID =
7611 if (VectorizedLoopID)
7612 L->setLoopID(*VectorizedLoopID);
7636#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7638 for (
const auto &Plan : VPlans)
7652std::pair<BasicBlock *, Value *>
7654 const SCEV2ValueTy &ExpandedSCEVs) {
7694 dbgs() <<
"Create Skeleton for epilogue vectorized loop (first pass)\n"
7704 dbgs() <<
"intermediate fn:\n"
7712 assert(Bypass &&
"Expected valid bypass basic block.");
7733 TCCheckBlock->
setName(
"vector.main.loop.iter.check");
7737 DT,
LI,
nullptr,
"vector.ph");
7742 "TC check is expected to dominate Bypass");
7766 return TCCheckBlock;
7775std::pair<BasicBlock *, Value *>
7777 const SCEV2ValueTy &ExpandedSCEVs) {
7785 nullptr,
"vec.epilog.iter.check",
true);
7787 VecEpilogueIterationCountCheck);
7792 "expected this to be saved from the previous pass.");
7810 VecEpilogueIterationCountCheck,
7834 for (
PHINode &Phi : VecEpilogueIterationCountCheck->
phis())
7837 for (
PHINode *Phi : PhisInBlock) {
7839 Phi->replaceIncomingBlockWith(
7841 VecEpilogueIterationCountCheck);
7848 return EPI.EpilogueIterationCountCheck == IncB;
7860 Type *IdxTy =
Legal->getWidestInductionType();
7864 EPResumeVal->
addIncoming(ConstantInt::get(IdxTy, 0),
7875 {VecEpilogueIterationCountCheck,
7886 "Expected trip count to have been safed in the first pass.");
7890 "saved trip count does not dominate insertion point.");
7901 Value *CheckMinIters =
7905 "min.epilog.iters.check");
7911 unsigned EpilogueLoopStep =
7917 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7918 const uint32_t Weights[] = {EstimatedSkipCount,
7919 MainLoopStep - EstimatedSkipCount};
7930 dbgs() <<
"Create Skeleton for epilogue vectorized loop (second pass)\n"
7944 assert(!Range.isEmpty() &&
"Trying to test an empty VF range.");
7945 bool PredicateAtRangeStart = Predicate(Range.Start);
7948 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7953 return PredicateAtRangeStart;
7963 auto MaxVFTimes2 = MaxVF * 2;
7965 VFRange SubRange = {VF, MaxVFTimes2};
7974 if (
auto *
I = dyn_cast<Instruction>(
Op)) {
7975 if (
auto *R = Ingredient2Recipe.lookup(
I))
7976 return R->getVPSingleValue();
7978 return Plan.getOrAddLiveIn(
Op);
7987 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7989 if (ECEntryIt != EdgeMaskCache.
end())
7990 return ECEntryIt->second;
7995 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7996 assert(BI &&
"Unexpected terminator found");
7999 return EdgeMaskCache[Edge] = SrcMask;
8005 return EdgeMaskCache[Edge] = SrcMask;
8008 assert(EdgeMask &&
"No Edge Mask found for condition");
8024 return EdgeMaskCache[Edge] = EdgeMask;
8031 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8033 assert(ECEntryIt != EdgeMaskCache.
end() &&
8034 "looking up mask for edge which has not been created");
8035 return ECEntryIt->second;
8043 BlockMaskCache[Header] =
nullptr;
8055 HeaderVPBB->
insert(
IV, NewInsertionPoint);
8062 BlockMaskCache[Header] = BlockMask;
8068 assert(BCEntryIt != BlockMaskCache.
end() &&
8069 "Trying to access mask for block without one.");
8070 return BCEntryIt->second;
8074 assert(OrigLoop->
contains(BB) &&
"Block is not a part of a loop");
8075 assert(BlockMaskCache.
count(BB) == 0 &&
"Mask for block already computed");
8077 "Loop header must have cached block mask");
8086 BlockMaskCache[BB] = EdgeMask;
8091 BlockMask = EdgeMask;
8095 BlockMask = Builder.
createOr(BlockMask, EdgeMask, {});
8098 BlockMaskCache[BB] = BlockMask;
8104 assert((isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
8105 "Must be called with either a load or store");
8111 "CM decision should be taken at this point.");
8137 auto *
GEP = dyn_cast<GetElementPtrInst>(
8138 Ptr->getUnderlyingValue()->stripPointerCasts());
8145 if (
LoadInst *Load = dyn_cast<LoadInst>(
I))
8164 "step must be loop invariant");
8168 if (
auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8171 assert(isa<PHINode>(PhiOrTrunc) &&
"must be a phi node here");
8182 *PSE.
getSE(), *OrigLoop, Range);
8208 auto isOptimizableIVTruncate =
8216 isOptimizableIVTruncate(
I),
Range)) {
8218 auto *
Phi = cast<PHINode>(
I->getOperand(0));
8229 unsigned NumIncoming =
Phi->getNumIncomingValues();
8240 for (
unsigned In = 0;
In < NumIncoming;
In++) {
8245 assert(In == 0 &&
"Both null and non-null edge masks found");
8247 "Distinct incoming values with one having a full mask");
8270 if (
ID && (
ID == Intrinsic::assume ||
ID == Intrinsic::lifetime_end ||
8271 ID == Intrinsic::lifetime_start ||
ID == Intrinsic::sideeffect ||
8272 ID == Intrinsic::pseudoprobe ||
8273 ID == Intrinsic::experimental_noalias_scope_decl))
8280 bool ShouldUseVectorIntrinsic =
8287 if (ShouldUseVectorIntrinsic)
8292 std::optional<unsigned> MaskPos;
8314 Variant = Decision.Variant;
8315 MaskPos = Decision.MaskPos;
8322 if (ShouldUseVectorCall) {
8323 if (MaskPos.has_value()) {
8338 Ops.insert(Ops.
begin() + *MaskPos, Mask);
8350 assert(!isa<BranchInst>(
I) && !isa<PHINode>(
I) && !isa<LoadInst>(
I) &&
8351 !isa<StoreInst>(
I) &&
"Instruction should have been handled earlier");
8366 switch (
I->getOpcode()) {
8369 case Instruction::SDiv:
8370 case Instruction::UDiv:
8371 case Instruction::SRem:
8372 case Instruction::URem: {
8389 case Instruction::Add:
8390 case Instruction::And:
8391 case Instruction::AShr:
8392 case Instruction::FAdd:
8393 case Instruction::FCmp:
8394 case Instruction::FDiv:
8395 case Instruction::FMul:
8396 case Instruction::FNeg:
8397 case Instruction::FRem:
8398 case Instruction::FSub:
8399 case Instruction::ICmp:
8400 case Instruction::LShr:
8401 case Instruction::Mul:
8402 case Instruction::Or:
8403 case Instruction::Select:
8404 case Instruction::Shl:
8405 case Instruction::Sub:
8406 case Instruction::Xor:
8407 case Instruction::Freeze:
8415 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8417 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8434 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(
I)) {
8436 case Intrinsic::assume:
8437 case Intrinsic::lifetime_start:
8438 case Intrinsic::lifetime_end:
8460 VPValue *BlockInMask =
nullptr;
8461 if (!IsPredicated) {
8465 LLVM_DEBUG(
dbgs() <<
"LV: Scalarizing and predicating:" << *
I <<
"\n");
8474 IsUniform, BlockInMask);
8485 if (
auto Phi = dyn_cast<PHINode>(Instr)) {
8486 if (Phi->getParent() != OrigLoop->
getHeader())
8489 if ((Recipe = tryToOptimizeInductionPHI(Phi,
Operands, Range)))
8495 "can only widen reductions and fixed-order recurrences here");
8513 PhisToFix.push_back(PhiRecipe);
8517 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8518 cast<TruncInst>(Instr),
Operands, Range)))
8526 if (
auto *CI = dyn_cast<CallInst>(Instr))
8527 return tryToWidenCall(CI,
Operands, Range);
8529 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8530 return tryToWidenMemory(Instr,
Operands, Range);
8532 if (!shouldWiden(Instr, Range))
8535 if (
auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8539 if (
auto *SI = dyn_cast<SelectInst>(Instr)) {
8544 if (
auto *CI = dyn_cast<CastInst>(Instr)) {
8549 return tryToWiden(Instr,
Operands, VPBB);
8552void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
ElementCount MinVF,
8556 auto MaxVFTimes2 = MaxVF * 2;
8558 VFRange SubRange = {VF, MaxVFTimes2};
8559 if (
auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8579 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8586 Header->insert(CanonicalIVPHI, Header->begin());
8591 Instruction::Add, {CanonicalIVPHI, &Plan.
getVFxUF()}, {HasNUW,
false},
DL,
8593 CanonicalIVPHI->
addOperand(CanonicalIVIncrement);
8612 Value *IncomingValue =
8613 ExitPhi.getIncomingValueForBlock(ExitingBB);
8620LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VFRange &Range) {
8640 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8641 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8647 bool IVUpdateMayOverflow =
false;
8658 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8678 "Unsupported interleave factor for scalable vectors");
8683 InterleaveGroups.
insert(IG);
8700 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8701 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8706 if (VPBB != HeaderVPBB)
8710 if (VPBB == HeaderVPBB)
8711 RecipeBuilder.createHeaderMask();
8712 else if (NeedsMasks)
8713 RecipeBuilder.createBlockInMask(BB);
8720 auto *
Phi = dyn_cast<PHINode>(Instr);
8721 if (Phi &&
Phi->getParent() == HeaderBB) {
8722 Operands.push_back(Plan->getOrAddLiveIn(
8725 auto OpRange = RecipeBuilder.mapToVPValues(
Instr->operands());
8726 Operands = {OpRange.begin(), OpRange.end()};
8732 if ((SI = dyn_cast<StoreInst>(&
I)) &&
8737 RecipeBuilder.tryToCreateWidenRecipe(Instr,
Operands, Range, VPBB);
8739 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8741 RecipeBuilder.setRecipe(Instr, Recipe);
8742 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8753 "unexpected recipe needs moving");
8773 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8774 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8775 "entry block must be set to a VPRegionBlock having a non-empty entry "
8777 RecipeBuilder.fixHeaderPhis();
8785 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder,
Range.Start);
8790 for (
const auto *IG : InterleaveGroups) {
8792 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8794 for (
unsigned i = 0; i < IG->getFactor(); ++i)
8795 if (
auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8796 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8797 StoredValues.
push_back(StoreR->getStoredValue());
8800 bool NeedsMaskForGaps =
8803 Recipe->getMask(), NeedsMaskForGaps);
8804 VPIG->insertBefore(Recipe);
8806 for (
unsigned i = 0; i < IG->getFactor(); ++i)
8808 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8809 if (!
Member->getType()->isVoidTy()) {
8820 Plan->setName(
"Initial VPlan");
8825 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8826 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.
getSCEV(StrideV));
8831 auto *CI = Plan->getOrAddLiveIn(
8832 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
8833 if (
VPValue *StrideVPV = Plan->getLiveIn(StrideV))
8839 if (!isa<SExtInst, ZExtInst>(U))
8841 VPValue *StrideVPV = Plan->getLiveIn(U);
8844 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(
8845 U->getType(), ScevStride->getAPInt().getSExtValue()));
8863 bool WithoutRuntimeCheck =
8866 WithoutRuntimeCheck);
8886 HCFGBuilder.buildHierarchicalCFG();
8894 *PSE.
getSE(), *TLI);
8899 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8900 Term->eraseFromParent();
8924void LoopVectorizationPlanner::adjustRecipesForReductions(
8927 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8934 if (
auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8937 bool HasIntermediateStore =
false;
8942 auto *IS2 =
R2->getRecurrenceDescriptor().IntermediateStore;
8943 HasIntermediateStore |= IS1 || IS2;
8964 if (HasIntermediateStore && ReductionPHIList.
size() > 1)
8966 R->moveBefore(*Header, Header->getFirstNonPhi());
8969 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8970 if (!PhiR || !PhiR->isInLoop() || (MinVF.
isScalar() && !PhiR->isOrdered()))
8976 "AnyOf reductions are not allowed for in-loop reductions");
8981 for (
unsigned I = 0;
I != Worklist.
size(); ++
I) {
8984 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8986 assert(isa<VPLiveOut>(U) &&
8987 "U must either be a VPSingleDef or VPLiveOut");
8990 Worklist.
insert(UserRecipe);
9003 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9006 unsigned IndexOfFirstOperand;
9014 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9015 assert(((MinVF.
isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9016 isa<VPWidenCallRecipe>(CurrentLink)) &&
9017 CurrentLink->getOperand(2) == PreviousLink &&
9018 "expected a call where the previous link is the added operand");
9026 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9028 LinkVPBB->
insert(FMulRecipe, CurrentLink->getIterator());
9031 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9032 if (PhiR->isInLoop() && Blend) {
9033 assert(Blend->getNumIncomingValues() == 2 &&
9034 "Blend must have 2 incoming values");
9035 if (Blend->getIncomingValue(0) == PhiR)
9036 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9038 assert(Blend->getIncomingValue(1) == PhiR &&
9039 "PhiR must be an operand of the blend");
9040 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9046 if (isa<VPWidenRecipe>(CurrentLink)) {
9047 assert(isa<CmpInst>(CurrentLinkI) &&
9048 "need to have the compare of the select");
9051 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9052 "must be a select recipe");
9053 IndexOfFirstOperand = 1;
9056 "Expected to replace a VPWidenSC");
9057 IndexOfFirstOperand = 0;
9062 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9063 ? IndexOfFirstOperand + 1
9064 : IndexOfFirstOperand;
9065 VecOp = CurrentLink->getOperand(VecOpId);
9066 assert(VecOp != PreviousLink &&
9067 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9068 (VecOpId - IndexOfFirstOperand)) ==
9070 "PreviousLink must be the operand other than VecOp");
9086 CurrentLink->replaceAllUsesWith(RedRecipe);
9087 PreviousLink = RedRecipe;
9092 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9105 return isa<VPWidenSelectRecipe>(U) ||
9106 (isa<VPReplicateRecipe>(U) &&
9107 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9108 Instruction::Select);
9114 for (
unsigned I = 0;
I != CmpR->getNumOperands(); ++
I)
9115 if (CmpR->getOperand(
I) == PhiR)
9123 if (
Select->getOperand(1) == PhiR)
9126 Select->getVPSingleValue()->replaceAllUsesWith(
Or);
9140 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9141 "reduction recipe must be defined before latch");
9143 std::optional<FastMathFlags> FMFs =
9150 return isa<VPInstruction>(&U) &&
9151 cast<VPInstruction>(&U)->getOpcode() ==
9168 assert(!PhiR->
isInLoop() &&
"Unexpected truncated inloop reduction!");
9177 Trunc->
insertAfter(NewExitingVPV->getDefiningRecipe());
9178 Extnd->insertAfter(Trunc);
9180 PhiR->
setOperand(1, Extnd->getVPSingleValue());
9181 NewExitingVPV = Extnd;
9200 ->appendRecipe(FinalReductionResult);
9202 FinalReductionResult,
9209#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9212 O << Indent <<
"INTERLEAVE-GROUP with factor " << IG->getFactor() <<
" at ";
9213 IG->getInsertPos()->printAsOperand(O,
false);
9223 for (
unsigned i = 0; i < IG->getFactor(); ++i) {
9224 if (!IG->getMember(i))
9227 O <<
"\n" << Indent <<
" store ";
9229 O <<
" to index " << i;
9231 O <<
"\n" << Indent <<
" ";
9233 O <<
" = load from index " << i;
9242 "Not a pointer induction according to InductionDescriptor!");
9244 "Unexpected type.");
9246 "Recipe should have been replaced");
9249 PHINode *CanonicalIV = cast<PHINode>(State.
get(IVR, 0,
true));
9254 Type *ScStValueType = ScalarStartValue->
getType();
9259 NewPointerPhi->
addIncoming(ScalarStartValue, VectorPH);
9266 Value *NumUnrolledElems =
9277 NewPointerPhi->
addIncoming(InductionGEP, VectorPH);
9282 for (
unsigned Part = 0; Part < State.
UF; ++Part) {
9284 Value *StartOffsetScalar =
9286 Value *StartOffset =
9293 "scalar step must be the same across all parts");
9300 State.
set(
this,
GEP, Part);
9305 assert(!State.
Instance &&
"VPDerivedIVRecipe being replicated.");
9316 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9317 DerivedIV->
setName(
"offset.idx");
9318 assert(DerivedIV != CanonicalIV &&
"IV didn't need transforming?");
9338 if (State.
Instance->Lane.isFirstLane()) {
9352 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9354 return Op->isDefinedOutsideVectorRegions();
9358 for (
unsigned Part = 1; Part < State.
UF; ++Part)
9367 for (
unsigned Part = 0; Part < State.
UF; ++Part)
9374 if (isa<StoreInst>(UI) &&
9385 for (
unsigned Part = 0; Part < State.
UF; ++Part)
9386 for (
unsigned Lane = 0; Lane < EndLane; ++Lane)
9398 auto &Builder = State.
Builder;
9400 for (
unsigned Part = 0; Part < State.
UF; ++Part) {
9402 Value *Mask =
nullptr;
9403 if (
auto *VPMask =
getMask()) {
9406 Mask = State.
get(VPMask, Part);
9408 Mask = Builder.CreateVectorReverse(Mask,
"reverse");
9413 NewLI = Builder.CreateMaskedGather(DataTy,
Addr, Alignment, Mask,
nullptr,
9414 "wide.masked.gather");
9416 NewLI = Builder.CreateMaskedLoad(DataTy,
Addr, Alignment, Mask,
9418 "wide.masked.load");
9420 NewLI = Builder.CreateAlignedLoad(DataTy,
Addr, Alignment,
"wide.load");
9425 NewLI = Builder.CreateVectorReverse(NewLI,
"reverse");
9426 State.
set(
this, NewLI, Part);
9435 Value *AllTrueMask =
9437 return Builder.
CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9438 {Operand, AllTrueMask, EVL},
nullptr,
Name);
9442 assert(State.
UF == 1 &&
"Expected only UF == 1 when vectorizing with "
9443 "explicit vector length.");
9451 auto &Builder = State.
Builder;
9456 Value *Mask =
nullptr;
9458 Mask = State.
get(VPMask, 0);
9462 Mask = Builder.CreateVectorSplat(State.
VF, Builder.getTrue());
9467 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {
Addr, Mask, EVL},
9468 nullptr,
"wide.masked.gather");
9473 Instruction::Load, DataTy,
Addr,
"vp.op.load"));
9481 State.
set(
this, Res, 0);
9491 auto &Builder = State.
Builder;
9494 for (
unsigned Part = 0; Part < State.
UF; ++Part) {
9496 Value *Mask =
nullptr;
9497 if (
auto *VPMask =
getMask()) {
9500 Mask = State.
get(VPMask, Part);
9502 Mask = Builder.CreateVectorReverse(Mask,
"reverse");
9505 Value *StoredVal = State.
get(StoredVPValue, Part);
9509 StoredVal = Builder.CreateVectorReverse(StoredVal,
"reverse");
9515 NewSI = Builder.CreateMaskedScatter(StoredVal,
Addr, Alignment, Mask);
9517 NewSI = Builder.CreateMaskedStore(StoredVal,
Addr, Alignment, Mask);
9519 NewSI = Builder.CreateAlignedStore(StoredVal,
Addr, Alignment);
9525 assert(State.
UF == 1 &&
"Expected only UF == 1 when vectorizing with "
9526 "explicit vector length.");
9533 auto &Builder = State.
Builder;
9537 Value *StoredVal = State.
get(StoredValue, 0);
9541 Value *Mask =
nullptr;
9543 Mask = State.
get(VPMask, 0);
9547 Mask = Builder.CreateVectorSplat(State.
VF, Builder.getTrue());
9550 if (CreateScatter) {
9552 Intrinsic::vp_scatter,
9553 {StoredVal, Addr, Mask, EVL});
9559 {StoredVal, Addr}));
9628 LLVM_DEBUG(
dbgs() <<
"LV: cannot compute the outer-loop trip count\n");
9632 Function *
F = L->getHeader()->getParent();
9638 LoopVectorizationCostModel CM(
SEL, L, PSE, LI, LVL, *
TTI, TLI, DB, AC, ORE,
F,
9643 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *
TTI, LVL, CM, IAI, PSE, Hints,
9663 bool AddBranchWeights =
9665 GeneratedRTChecks Checks(*PSE.
getSE(), DT, LI,
TTI,
9666 F->getParent()->getDataLayout(), AddBranchWeights);
9668 VF.
Width, 1, LVL, &CM, BFI, PSI, Checks);
9670 << L->getHeader()->getParent()->getName() <<
"\"\n");
9690 if (
auto *S = dyn_cast<StoreInst>(&Inst)) {
9691 if (S->getValueOperand()->getType()->isFloatTy())
9701 while (!Worklist.
empty()) {
9703 if (!L->contains(
I))
9705 if (!Visited.
insert(
I).second)
9712 if (isa<FPExtInst>(
I) && EmittedRemark.
insert(
I).second)
9715 I->getDebugLoc(), L->getHeader())
9716 <<
"floating point conversion changes vector width. "
9717 <<
"Mixed floating point precision requires an up/down "
9718 <<
"cast that will negatively impact performance.";
9721 for (
Use &
Op :
I->operands())
9722 if (
auto *OpI = dyn_cast<Instruction>(
Op))
9729 std::optional<unsigned> VScale,
Loop *L,
9742 <<
"LV: Interleaving only is not profitable due to runtime checks\n");
9783 unsigned AssumedMinimumVscale = 1;
9785 AssumedMinimumVscale = *VScale;
9786 IntVF *= AssumedMinimumVscale;
9804 uint64_t MinTC = std::max(MinTC1, MinTC2);
9806 MinTC =
alignTo(MinTC, IntVF);
9810 dbgs() <<
"LV: Minimum required TC for runtime checks to be profitable:"
9818 LLVM_DEBUG(
dbgs() <<
"LV: Vectorization is not beneficial: expected "
9819 "trip count < minimum profitable VF ("
9830 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9832 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9837 "VPlan-native path is not enabled. Only process inner loops.");
9844 << L->getHeader()->getParent()->getName() <<
"' from "
9845 << DebugLocStr <<
"\n");
9850 dbgs() <<
"LV: Loop hints:"
9861 Function *
F = L->getHeader()->getParent();
9883 LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: Cannot prove legality.\n");
9893 if (!L->isInnermost())
9897 assert(L->isInnermost() &&
"Inner loop expected.");
9919 LLVM_DEBUG(
dbgs() <<
"LV: Found a loop with a very small trip count. "
9920 <<
"This loop is worth vectorizing only if no scalar "
9921 <<
"iteration overheads are incurred.");
9923 LLVM_DEBUG(
dbgs() <<
" But vectorizing was explicitly forced.\n");
9936 LLVM_DEBUG(
dbgs() <<
" But the target considers the trip count too "
9937 "small to consider vectorizing.\n");
9939 "The trip count is below the minial threshold value.",
9940 "loop trip count is too low, avoiding vectorization",
9941 "LowTripCount",
ORE, L);
9950 if (
F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9952 "Can't vectorize when the NoImplicitFloat attribute is used",
9953 "loop not vectorized due to NoImplicitFloat attribute",
9954 "NoImplicitFloat",
ORE, L);
9966 "Potentially unsafe FP op prevents vectorization",
9967 "loop not vectorized due to unsafe FP support.",
9968 "UnsafeFP",
ORE, L);
9973 bool AllowOrderedReductions;
9983 ExactFPMathInst->getDebugLoc(),
9984 ExactFPMathInst->getParent())
9985 <<
"loop not vectorized: cannot prove it is safe to reorder "
9986 "floating-point operations";
9988 LLVM_DEBUG(
dbgs() <<
"LV: loop not vectorized: cannot prove it is safe to "
9989 "reorder floating-point operations\n");
9995 LoopVectorizationCostModel CM(
SEL, L, PSE,
LI, &LVL, *
TTI,
TLI,
DB,
AC,
ORE,
9998 LoopVectorizationPlanner LVP(L,
LI,
DT,
TLI, *
TTI, &LVL, CM, IAI, PSE, Hints,
10006 std::optional<VectorizationFactor> MaybeVF = LVP.
plan(UserVF, UserIC);
10011 bool AddBranchWeights =
10014 F->getParent()->getDataLayout(), AddBranchWeights);
10020 unsigned SelectedIC = std::max(IC, UserIC);
10027 bool ForceVectorization =
10029 if (!ForceVectorization &&
10031 *PSE.
getSE(),
SEL)) {
10034 DEBUG_TYPE,
"CantReorderMemOps", L->getStartLoc(),
10036 <<
"loop not vectorized: cannot prove it is safe to reorder "
10037 "memory operations";
10046 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10047 bool VectorizeLoop =
true, InterleaveLoop =
true;
10049 LLVM_DEBUG(
dbgs() <<
"LV: Vectorization is possible but not beneficial.\n");
10050 VecDiagMsg = std::make_pair(
10051 "VectorizationNotBeneficial",
10052 "the cost-model indicates that vectorization is not beneficial");
10053 VectorizeLoop =
false;
10056 if (!MaybeVF && UserIC > 1) {
10059 LLVM_DEBUG(
dbgs() <<
"LV: Ignoring UserIC, because vectorization and "
10060 "interleaving should be avoided up front\n");
10061 IntDiagMsg = std::make_pair(
10062 "InterleavingAvoided",
10063 "Ignoring UserIC, because interleaving was avoided up front");
10064 InterleaveLoop =
false;
10065 }
else if (IC == 1 && UserIC <= 1) {
10068 IntDiagMsg = std::make_pair(
10069 "InterleavingNotBeneficial",
10070 "the cost-model indicates that interleaving is not beneficial");
10071 InterleaveLoop =
false;
10073 IntDiagMsg.first =
"InterleavingNotBeneficialAndDisabled";
10074 IntDiagMsg.second +=
10075 " and is explicitly disabled or interleave count is set to 1";
10077 }
else if (IC > 1 && UserIC == 1) {
10080 dbgs() <<
"LV: Interleaving is beneficial but is explicitly disabled.");
10081 IntDiagMsg = std::make_pair(
10082 "InterleavingBeneficialButDisabled",
10083 "the cost-model indicates that interleaving is beneficial "
10084 "but is explicitly disabled or interleave count is set to 1");
10085 InterleaveLoop =
false;
10089 IC = UserIC > 0 ? UserIC : IC;
10093 if (!VectorizeLoop && !InterleaveLoop) {
10097 L->getStartLoc(), L->getHeader())
10098 << VecDiagMsg.second;
10102 L->getStartLoc(), L->getHeader())
10103 << IntDiagMsg.second;
10106 }
else if (!VectorizeLoop && InterleaveLoop) {
10110 L->getStartLoc(), L->getHeader())
10111 << VecDiagMsg.second;
10113 }
else if (VectorizeLoop && !InterleaveLoop) {
10115 <<
") in " << DebugLocStr <<
'\n');
10118 L->getStartLoc(), L->getHeader())
10119 << IntDiagMsg.second;
10121 }
else if (VectorizeLoop && InterleaveLoop) {
10123 <<
") in " << DebugLocStr <<
'\n');
10127 bool DisableRuntimeUnroll =
false;
10128 MDNode *OrigLoopID = L->getLoopID();
10130 using namespace ore;
10131 if (!VectorizeLoop) {
10132 assert(IC > 1 &&
"interleave count should not be 1 or 0");
10135 InnerLoopUnroller Unroller(L, PSE,
LI,
DT,
TLI,
TTI,
AC,
ORE, IC, &LVL,
10144 <<
"interleaved loop (interleaved count: "
10145 << NV(
"InterleaveCount", IC) <<
")";
10160 EPI, &LVL, &CM,
BFI,
PSI, Checks);
10162 std::unique_ptr<VPlan> BestMainPlan(
10164 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.
executePlan(
10179 Header->setName(
"vec.epilog.vector.body");
10189 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10191 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10195 ExpandR->eraseFromParent();
10202 if (isa<VPCanonicalIVPHIRecipe>(&R))
10205 Value *ResumeV =
nullptr;
10207 if (
auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10209 ReductionPhi->getRecurrenceDescriptor();
10211 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10217 cast<Instruction>(ResumeV)->
getParent()->getFirstNonPHI());
10227 if (
auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10228 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10229 ID = &Ind->getInductionDescriptor();
10231 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10232 IndPhi = WidenInd->getPHINode();
10233 ID = &WidenInd->getInductionDescriptor();
10240 assert(ResumeV &&
"Must have a resume value");
10242 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10246 DT,
true, &ExpandedSCEVs);
10247 ++LoopsEpilogueVectorized;
10250 DisableRuntimeUnroll =
true;
10264 DisableRuntimeUnroll =
true;
10274 std::optional<MDNode *> RemainderLoopID =
10277 if (RemainderLoopID) {
10278 L->setLoopID(*RemainderLoopID);
10280 if (DisableRuntimeUnroll)
10319 bool Changed =
false, CFGChanged =
false;
10326 for (
const auto &L : *
LI)
10327 Changed |= CFGChanged |=
10338 LoopsAnalyzed += Worklist.
size();
10341 while (!Worklist.
empty()) {
10387 runImpl(
F,
SE,
LI,
TTI,
DT,
BFI, &
TLI,
DB,
AC,
LAIs,
ORE,
PSI);
10388 if (!Result.MadeAnyChange)
10407 if (Result.MadeCFGChange) {
10423 OS, MapClassName2PassName);
10426 OS << (InterleaveOnlyWhenForced ?
"" :
"no-") <<
"interleave-forced-only;";
10427 OS << (VectorizeOnlyWhenForced ?
"" :
"no-") <<
"vectorize-forced-only;";
static unsigned getIntrinsicID(const SDNode *N)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefAnalysis InstSet & ToRemove
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This is the interface for a simple mod/ref and alias analysis over globals.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range)
Creates a VPWidenIntOrFpInductionRecipe for Phi.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static void emitInvalidCostRemarks(SmallVector< InstructionVFPair > InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I)
Create an analysis remark that explains why vectorization failed.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::string getDebugLocString(const Loop *L)
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static Type * largestIntegerVectorType(Type *T1, Type *T2)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
separate const offset from gep
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
This defines the Use class.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
static const uint32_t IV[8]
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
int64_t getSExtValue() const
Get sign extended value.
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
BinaryOps getOpcode() const
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_UGT
unsigned greater than
@ ICMP_ULT
unsigned less than
@ ICMP_ULE
unsigned less or equal
static ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getFalse(LLVMContext &Context)
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
constexpr bool isVector() const
One or more elements.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
constexpr bool isScalar() const
Exactly one element.
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
void printDebugTracesAtEnd() override
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
void printDebugTracesAtEnd() override
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Module * getParent()
Get the module that this global value is contained inside of...
Common base class shared among various IRBuilders.
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
A struct for saving information about induction variables.
BinaryOperator * getInductionBinOp() const
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
bool areSafetyChecksAdded()
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitablity analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
bool OptForSizeBasedOnProfile
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void vectorizeInterleaveGroup(const InterleaveGroup< Instruction > *Group, ArrayRef< VPValue * > VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef< VPValue * > StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps)
Try to vectorize interleaved access group Group with the base address given in Addr,...
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State)
Create the exit value of first order recurrences in the middle block and update their users.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
const BasicBlock * getParent() const
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Drive the analysis of interleaved memory accesses in the loop.
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
bool hasPredStores() const
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
bool isAccessInterleaved(Instruction *Instr)
Check if Instr belongs to any interleaved access group.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
const Function * TheFunction
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::pair< InstructionCost, bool > VectorizationCostTy
The vectorization cost is a combination of the cost itself and a boolean indicating whether any of th...
DemandedBits * DB
Demanded bits analysis.
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
VectorizationCostTy expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool runtimeChecksRequired()
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr)
Get the interleaved access group that Instr belongs to.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
unsigned getNumStores() const
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
uint64_t getMaxSafeVectorWidthInBits() const
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool isSafeForAnyVectorWidth() const
unsigned getNumLoads() const
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool prepareToFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking, and mark all respective ...
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Instruction * getExactFPInst()
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool isScalableVectorizationDisabled() const
enum ForceKind getForce() const
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints that enable vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
bool isPotentiallyUnsafe() const
ElementCount getWidth() const
@ FK_Enabled
Forcing enabled.
@ FK_Undefined
Not selected.
@ FK_Disabled
Forcing disabled.
unsigned getPredicate() const
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
unsigned getInterleave() const
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
const MDOperand & getOperand(unsigned I) const
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
unsigned getNumOperands() const
Return number of MDNode operands.
static MDString * get(LLVMContext &Context, StringRef Str)
This class implements a map that also provides access to all stored values in a deterministic order.
iterator find(const KeyT &Key)
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
void preserve()
Mark an analysis as preserved.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
iterator end()
Get an iterator to the end of the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
iterator begin()
Get an iterator to the beginning of the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
This class provides computation of slot numbers for LLVM Assembly writing.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isPointerTy() const
True if this is an instance of PointerType.
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isTokenTy() const
Return true if this is 'token'.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
iterator begin()
Recipe iterator methods.
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
void insert(VPRecipeBase *Recipe, iterator InsertPt)
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
VPRegionBlock * getParent()
const VPBasicBlock * getExitingBasicBlock() const
void setName(const Twine &newName)
const VPBasicBlock * getEntryBasicBlock() const
VPBlockBase * getSingleSuccessor() const
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
VPValue * getStartValue() const
This is a concrete Recipe that models a single VPlan-level instruction.
@ FirstOrderRecurrenceSplice
unsigned getOpcode() const
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
VPValue * getAddr() const
Return the address accessed by this recipe.
VPValue * getMask() const
Return the mask used by this recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
unsigned getNumStoreOperands() const
Returns the number of stored operands of this interleave group.
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getFirstLane()
A value that is used outside the VPlan.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
VPBasicBlock * getParent()
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicationRecipe for I.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
A recipe for handling reduction phis.
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
const RecurrenceDescriptor & getRecurrenceDescriptor() const
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
This class can be used to assign names to VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
void setOperand(unsigned I, VPValue *New)
unsigned getNumOperands() const
VPValue * getOperand(unsigned N) const
void addOperand(VPValue *Operand)
Value * getUnderlyingValue()
Return the underlying Value attached to this VPValue.
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
void replaceAllUsesWith(VPValue *New)
user_iterator user_begin()
unsigned getNumUsers() const
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
A recipe for widening Call instructions.
A Recipe for widening the canonical induction variable of the vector loop.
VPWidenCastRecipe is a recipe to create vector cast instructions.
A recipe for handling GEP instructions.
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
A common base class for widening memory operations.
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
VPValue * getMask() const
Return the mask used by this recipe.
VPValue * getAddr() const
Return the address accessed by this recipe.
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
A recipe for handling phis that are widened in the vector loop.
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a copy of vector type its ingredient.
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
VPBasicBlock * getEntry()
VPValue & getVectorTripCount()
The vector trip count.
void setName(const Twine &newName)
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
VPValue * getTripCount() const
The trip count of the original loop.
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
void removeLiveOut(PHINode *PN)
void addLiveOut(PHINode *PN, VPValue *V)
VPBasicBlock * getPreheader()
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE)
Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping original scalar pre-header) w...
bool hasVF(ElementCount VF)
bool hasUF(unsigned UF) const
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
VPValue * getSCEVExpansion(const SCEV *S) const
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUser() const
Return true if there is exactly one user of this value.
void setName(const Twine &Name)
Change the name of the value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
LLVMContext & getContext() const
All values hold a context through their type.
StringRef getName() const
Return a constant reference to the value's name.
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
VectorBuilder & setMask(Value *NewMask)
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
Base class of all SIMD vector types.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
constexpr ScalarTy getFixedValue() const
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isNonZero() const
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
constexpr bool isZero() const
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
A raw_ostream that writes to an std::string.
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ PredicateElseScalarEpilogue
@ PredicateOrDontVectorize
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Tail
Attemps to make calls as fast as possible while guaranteeing that tail call optimization can always b...
@ C
The default llvm calling convention, compatible with C.
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
NodeAddr< PhiNode * > Phi
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path)
Get end iterator over path.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
This is an optimization pass for GlobalISel generic memory operations.
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
void stable_sort(R &&Range)
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are are tuples (A,...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
auto map_range(ContainerTy &&C, FuncTy F)
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
std::unique_ptr< VPlan > VPlanPtr
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
cl::opt< bool > EnableLoopVectorization
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
@ Invalid
Denotes invalid value.
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
This struct is a compact representation of a valid (non-zero power of two) alignment.
A special type used by analysis passes to provide an address that identifies that particular analysis...
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
ElementCountComparator creates a total ordering for ElementCount for the purposes of using it in a se...
Encapsulate information regarding vectorization of a loop and its epilogue.
BasicBlock * SCEVSafetyCheck
BasicBlock * MemSafetyCheck
BasicBlock * MainLoopIterationCountCheck
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
BasicBlock * EpilogueIterationCountCheck
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
std::optional< unsigned > MaskPos
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
bool processLoop(Loop *L)
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
A MapVector that performs no allocations if smaller than a certain size.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
A recipe for handling first-order recurrence phis.
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
bool isFirstIteration() const
void execute(VPTransformState &State) override
Generate the wide load or gather.
VPValue * getEVL() const
Return the EVL operand.
A recipe for widening load operations, using the address to load from and an optional mask.
void execute(VPTransformState &State) override
Generate a wide load or gather.
A recipe for widening select instructions.
VPValue * getStoredValue() const
Return the address accessed by this recipe.
void execute(VPTransformState &State) override
Generate the wide store or scatter.
VPValue * getEVL() const
Return the EVL operand.
A recipe for widening store operations, using the stored value, the address to store to and an option...
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static bool HoistRuntimeChecks