#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

    "llvm.loop.vectorize.followup_vectorized";
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
    cl::desc("Enable vectorization of epilogue loops."));

    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "

    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "

    cl::desc("The maximum allowed number of runtime memory checks"));
216 "prefer-predicate-over-epilogue",
219 cl::desc(
"Tail-folding and predication preferences over creating a scalar "
223 "Don't tail-predicate loops, create scalar epilogue"),
225 "predicate-else-scalar-epilogue",
226 "prefer tail-folding, create scalar epilogue if tail "
229 "predicate-dont-vectorize",
230 "prefers tail-folding, don't attempt vectorization if "
231 "tail-folding fails.")));
234 "force-tail-folding-style",
cl::desc(
"Force the tail folding style"),
237 clEnumValN(TailFoldingStyle::None,
"none",
"Disable tail folding"),
239 TailFoldingStyle::Data,
"data",
240 "Create lane mask for data only, using active.lane.mask intrinsic"),
241 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
242 "data-without-lane-mask",
243 "Create lane mask with compare/stepvector"),
244 clEnumValN(TailFoldingStyle::DataAndControlFlow,
"data-and-control",
245 "Create lane mask using active.lane.mask intrinsic, and use "
246 "it for both data and control flow"),
248 TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
249 "data-and-control-without-rt-check",
250 "Similar to data-and-control, but remove the runtime check")));
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));

    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));
    cl::desc("A flag that overrides the target's number of scalar registers."));

    cl::desc("A flag that overrides the target's number of vector registers."));

    cl::desc("A flag that overrides the target's max interleave factor for "

    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

        "The cost of a loop that is considered 'small' by the interleaver."));
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

        "Enable runtime interleaving until load/store ports are saturated"));

    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

    cl::desc("Max number of stores to be predicated behind an if."));

    cl::desc("Count the induction variable only once when interleaving"));

    cl::desc("Enable if predication of stores during vectorization."));

    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

    cl::desc("Prefer in-loop vector reductions, "
             "overriding the target's preference."));

    cl::desc("Enable the vectorization of loops with in-order (strict) "

        "Prefer predicating a reduction operation over a select after the loop."));
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

    cl::desc("Enable loop interleaving in Loop vectorization passes"));

    cl::desc("Run the Loop vectorization passes"));

    cl::desc("Use dot format instead of plain text when dumping VPlans"));

    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);

  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");

  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(
        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,

  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
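// This helper interleaves the lanes of the input vectors: scalable vectors
// are combined pairwise with llvm.experimental.vector.interleave2 (hence the
// factor-of-2 restriction seen above), while the fixed-width path (partially
// elided here) concatenates the inputs and applies one shufflevector with an
// interleave mask over NumElts elements.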
class GeneratedRTChecks;

    this->MinProfitableTripCount = VecWidth;

  virtual std::pair<BasicBlock *, Value *>

                               VPValue *BlockInMask, bool NeedsMaskForGaps);

      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
822 "A high UF for the epilogue loop is likely not beneficial.");
842 GeneratedRTChecks &Checks)
844 EPI.MainLoopVF,
EPI.MainLoopVF,
EPI.MainLoopUF, LVL,
851 const SCEV2ValueTy &ExpandedSCEVs)
final {
858 virtual std::pair<BasicBlock *, Value *>
882 GeneratedRTChecks &Check)
887 std::pair<BasicBlock *, Value *>
911 GeneratedRTChecks &Checks)
918 std::pair<BasicBlock *, Value *>
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (OpInst->getDebugLoc() != Empty)
      return OpInst->getDebugLoc();

  return I->getDebugLoc();

  dbgs() << "LV: " << Prefix << DebugMsg;

    CodeRegion = I->getParent();

  if (I->getDebugLoc())
    DL = I->getDebugLoc();
  return B.CreateElementCount(Ty, VF);

  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

      << "loop not vectorized: " << OREMsg);

      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",

         << "vectorized " << LoopType << "loop (vectorization width: "
         << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";

  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
    LoopDbgLoc.print(OS);

  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {

    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)

      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))

      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        RecWithFlags->dropPoisonGeneratingFlags();
        Instruction *Instr = CurRec->getUnderlyingInstr();
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
            InterleaveRec->getInterleaveGroup();
        bool NeedPredication = false;
             I < NumMembers; ++I) {
              Legal->blockNeedsPredication(Member->getParent());
        if (NeedPredication)
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
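  // The lambda above walks the defining recipes of an address computation (a
  // "backward slice") and drops poison-generating flags (nsw/nuw, exact,
  // inbounds) from every VPRecipeWithIRFlags it visits. It is invoked for the
  // address of each consecutive widened memory recipe and for each interleave
  // group with members in blocks that need predication, so masking or
  // speculating those accesses cannot introduce poison.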
1158 "Expected to find a resume value for the reduction.");
1293 "Profitable to scalarize relevant only for VF > 1.");
1300 auto Scalars = InstsToScalarize.find(VF);
1301 assert(Scalars != InstsToScalarize.end() &&
1302 "VF not yet analyzed for scalarization profitability");
1303 return Scalars->second.contains(
I);
1311 if (isa<PseudoProbeInst>(
I))
1322 auto UniformsPerVF = Uniforms.find(VF);
1323 assert(UniformsPerVF != Uniforms.end() &&
1324 "VF not yet analyzed for uniformity");
1325 return UniformsPerVF->second.count(
I);
1338 auto ScalarsPerVF = Scalars.find(VF);
1339 assert(ScalarsPerVF != Scalars.end() &&
1340 "Scalar values are not calculated for VF");
1341 return ScalarsPerVF->second.count(
I);
1347 return VF.
isVector() && MinBWs.contains(
I) &&
1367 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W,
Cost);
1378 for (
unsigned i = 0; i < Grp->
getFactor(); ++i) {
1381 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W,
Cost);
1383 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W, 0);
1398 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(
I, VF);
1399 auto Itr = WideningDecisions.
find(InstOnVF);
1400 if (Itr == WideningDecisions.
end())
1402 return Itr->second.first;
1409 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(
I, VF);
1411 "The cost is not calculated");
1412 return WideningDecisions[InstOnVF].second;
1420 auto *Trunc = dyn_cast<TruncInst>(
I);
1433 Value *
Op = Trunc->getOperand(0);
1450 if (VF.
isScalar() || Uniforms.contains(VF))
1453 collectLoopUniforms(VF);
1454 collectLoopScalars(VF);
1474 bool LI = isa<LoadInst>(V);
1475 bool SI = isa<StoreInst>(V);
1490 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1491 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1502 return ScalarCost < SafeDivisorCost;
  std::pair<InstructionCost, InstructionCost>

    auto RequiresScalarEpilogue = [this](ElementCount VF) {
    bool IsRequired = all_of(Range, RequiresScalarEpilogue);
    assert((IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
           "all VFs in range must agree on whether a scalar epilogue is required");

    if (!CanFoldTailByMasking)

    return InLoopReductions.contains(Phi);

                                   bool *NeedsMask = nullptr) const;

    WideningDecisions.clear();

  unsigned NumPredStores = 0;

                                     bool FoldTailByMasking);

  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       bool FoldTailByMasking);

  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  std::optional<InstructionCost>

  PredicatedBBsAfterVectorization;

  bool CanFoldTailByMasking = false;

                               ScalarCostsTy &ScalarCosts,

                   std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
class GeneratedRTChecks {

  Value *SCEVCheckCond = nullptr;

  Value *MemRuntimeCheckCond = nullptr;

  bool CostTooHigh = false;

      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

                                 nullptr, "vector.scevcheck");

    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,

      auto DiffChecks = RtPtrChecking.getDiffChecks();
        Value *RuntimeVF = nullptr;
          RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);

          MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),

      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");

    if (!MemCheckBlock && !SCEVCheckBlock)
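    // At this point the SCEV check block ("vector.scevcheck") and, when
    // RtPtrChecking.Need is set, the memory check block have been split off
    // in front of the preheader; if neither block ended up holding a check,
    // the (elided) code that follows removes them again.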
    if (SCEVCheckBlock) {

    if (MemCheckBlock) {

    if (MemCheckBlock) {

    if (SCEVCheckBlock) {

    if (SCEVCheckBlock || MemCheckBlock)

      if (SCEVCheckBlock->getTerminator() == &I)

      if (MemCheckBlock->getTerminator() == &I)

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
  ~GeneratedRTChecks() {
    SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
          I.eraseFromParent();

    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();

    SCEVCheckCond = nullptr;
    if (auto *C = dyn_cast<ConstantInt>(Cond))

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,

    return SCEVCheckBlock;

    if (!MemRuntimeCheckCond)

    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

                     MemCheckBlock->getTerminator(),
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
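    // The two routines above splice the prepared check blocks into the CFG in
    // front of LoopVectorPreHeader and then null out SCEVCheckCond /
    // MemRuntimeCheckCond, which is how the destructor knows the blocks were
    // actually used and must not be erased.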
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");

    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "

  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {

  for (Loop *InnerL : L)

                         ? B.CreateSExtOrTrunc(Index, StepTy)
                         : B.CreateCast(Instruction::SIToFP, Index, StepTy);
  if (CastedIndex != Index) {
    Index = CastedIndex;
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
    if (auto *CY = dyn_cast<ConstantInt>(Y))
    return B.CreateAdd(X, Y);

    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
    if (auto *CY = dyn_cast<ConstantInt>(Y))
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  switch (InductionKind) {
           "Vector indices not supported for integer inductions yet");
           "Index type does not match StartValue type");
    if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
      return B.CreateSub(StartValue, Index);

           "Vector indices not supported for FP inductions yet");
    assert((InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,

  if (F.hasFnAttribute(Attribute::VScaleRange))
    return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
  return std::nullopt;
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
  Type *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();

          Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
    std::optional<unsigned> MaxVScale =
      MaxVF *= *MaxVScale;
    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
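// The check above proves the widened induction cannot overflow: with the
// small constant maximum trip count TC known, the largest per-iteration step
// is MaxVF * MaxUF (MaxVF scaled by the maximum vscale for scalable VFs), so
// overflow is impossible when (MaxUIntTripCount - TC) > MaxVF * MaxUF.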
                                           VPValue *BlockInMask,
                                           bool NeedsMaskForGaps) {
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  unsigned InterleaveFactor = Group->getFactor();

         "Reversed masked interleave-group not supported.");

  for (unsigned Part = 0; Part < UF; Part++) {
    if (auto *I = dyn_cast<Instruction>(AddrPart))

    bool InBounds = false;
      InBounds = gep->isInBounds();

  auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
                             unsigned Part, Value *MaskForGaps) -> Value * {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *BlockInMaskPart = State.get(BlockInMask, Part);
          MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
          nullptr, "interleaved.mask");

    Value *BlockInMaskPart = State.get(BlockInMask, Part);
        "interleaved.mask");

  if (isa<LoadInst>(Instr)) {
    Value *MaskForGaps = nullptr;
    if (NeedsMaskForGaps) {
      assert(MaskForGaps && "Mask for Gaps is required but it is null");

    for (unsigned Part = 0; Part < UF; Part++) {
      if (BlockInMask || MaskForGaps) {
               "masked interleaved groups are not allowed.");
        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
            GroupMask, PoisonVec, "wide.masked.vec");

    if (VecTy->isScalableTy()) {
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");

      for (unsigned Part = 0; Part < UF; ++Part) {
            Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
            nullptr, "strided.vec");

        for (unsigned I = 0; I < InterleaveFactor; ++I) {
          if (Member->getType() != ScalarTy) {
          State.set(VPDefs[J], StridedVec, Part);

    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      for (unsigned Part = 0; Part < UF; Part++) {
            NewLoads[Part], StrideMask, "strided.vec");
        if (Member->getType() != ScalarTy) {
        State.set(VPDefs[J], StridedVec, Part);
  Value *MaskForGaps =
         "masked interleaved groups are not allowed.");
         "masking gaps for scalable vectors is not yet supported.");

  for (unsigned Part = 0; Part < UF; Part++) {
    unsigned StoredIdx = 0;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
             "Failed to get a member from an interleaved store group");

      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);

      if (StoredVec->getType() != SubVT)

    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
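  // Store path of the interleave-group lowering: each member's per-part
  // vector is fetched from State, cast to the sub-vector type when it
  // differs, interleaved into one wide vector, and written back with a wide
  // store that is masked whenever a block mask or a gap mask is present.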
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  if (isa<NoAliasScopeDeclInst>(Instr))

  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Cloned->setName(Instr->getName() + ".cloned");

  if (auto DL = Instr->getDebugLoc())

  auto InputInstance = Instance;
    Cloned->setOperand(I.index(), State.get(Operand, InputInstance));

  State.set(RepRecipe, Cloned, Instance);

  if (auto *II = dyn_cast<AssumeInst>(Cloned))

  if (IfPredicateInstr)

  if (Cost->foldTailByMasking()) {
           "VF*UF must be a power of 2 when folding tail by masking");

  auto *DstFVTy = cast<VectorType>(DstVTy);
  auto VF = DstFVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstFVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");
2773 "Only one type should be a pointer type");
2775 "Only one type should be a floating point type");
2801 auto CreateStep = [&]() ->
Value * {
2826 Value *MaxUIntTripCount =
2841 "TC check is expected to dominate Bypass");
2860 if (!SCEVCheckBlock)
2866 "Cannot SCEV check stride or overflow when optimizing for size");
2881 return SCEVCheckBlock;
2900 "Cannot emit memory checks when optimizing for size, unless forced "
2906 <<
"Code-size may be reduced by not forcing "
2907 "vectorization, or by source-code modifications "
2908 "eliminating the need for runtime checks "
2909 "(e.g., adding 'restrict').";
2917 return MemCheckBlock;
2926 "multiple exit loop without required epilogue?");
2930 LI,
nullptr,
Twine(Prefix) +
"middle.block");
2933 nullptr,
Twine(Prefix) +
"scalar.ph");
2950 BrInst->
setDebugLoc(ScalarLatchTerm->getDebugLoc());
2966 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2972 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2973 if (OrigPhi == OldInduction) {
2988 if (AdditionalBypass.first) {
2989 B.SetInsertPoint(AdditionalBypass.first,
2990 AdditionalBypass.first->getFirstInsertionPt());
2991 EndValueFromAdditionalBypass =
2994 EndValueFromAdditionalBypass->
setName(
"ind.end");
3014 if (AdditionalBypass.first)
3016 EndValueFromAdditionalBypass);
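  // This computes the value an induction resumes from in the scalar loop
  // ("ind.end"). When an AdditionalBypass block is supplied (as epilogue
  // vectorization does), a second end value is materialized in that block and
  // recorded for the same original phi.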
                                  const SCEV2ValueTy &ExpandedSCEVs) {
  const SCEV *Step = ID.getStep();
  if (auto *C = dyn_cast<SCEVConstant>(Step))
    return C->getValue();
  if (auto *U = dyn_cast<SCEVUnknown>(Step))
    return U->getValue();
  auto I = ExpandedSCEVs.find(Step);
  assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");

    const SCEV2ValueTy &ExpandedSCEVs,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");

    PHINode *OrigPhi = InductionEntry.first;

      !Cost->foldTailByMasking()) {
    B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());

#ifdef EXPENSIVE_CHECKS

std::pair<BasicBlock *, Value *>
    const SCEV2ValueTy &ExpandedSCEVs) {
    assert(isa<PHINode>(UI) && "Expected LCSSA form");
    MissingVals[UI] = EndValue;

      auto *UI = cast<Instruction>(U);
        assert(isa<PHINode>(UI) && "Expected LCSSA form");

        Value *CountMinusOne = B.CreateSub(
        CountMinusOne->setName("cmo");

        assert(StepVPV && "step must have been expanded during VPlan execution");
                                          : State.get(StepVPV, {0, 0});
        Escape->setName("ind.escape");
        MissingVals[UI] = Escape;

  for (auto &I : MissingVals) {
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
      PHI->addIncoming(I.second, MiddleBlock);
struct CSEDenseMapInfo {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);

    assert(canHandle(I) && "Unknown instruction!");
                                             I->value_op_end()));

    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
    return LHS->isIdenticalTo(RHS);

    if (!CSEDenseMapInfo::canHandle(&In))

      In.replaceAllUsesWith(V);
      In.eraseFromParent();
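// The loop above is a simple common-subexpression elimination over the
// vector body: only instructions CSEDenseMapInfo::canHandle accepts
// (insert/extractelement, shufflevector, GEP) are considered, duplicates are
// detected with isIdenticalTo, and the later copy is replaced and erased.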
  for (auto &ArgOp : CI->args())

    return ScalarCallCost;

  for (Type *ScalarTy : ScalarTys)

             getScalarizationOverhead(CI, VF, CostKind);

    *NeedsMask = MaskRequired;

  if (!VecFunc && !MaskRequired) {

  if (VectorCallCost < Cost) {
    Cost = VectorCallCost;

  assert(ID && "Expected intrinsic call!");

  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

                  std::back_inserter(ParamTys),
                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
                                dyn_cast<IntrinsicInst>(CI));

  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;

  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    for (unsigned Part = 0; Part < UFToUse; ++Part) {
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
      if (TruncatedTy == OriginalTy)

      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);

      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        Value *Op0 = ShrinkOperand(BO->getOperand(0));
        Value *Op1 = ShrinkOperand(BO->getOperand(1));
        NewI = B.CreateBinOp(BO->getOpcode(), Op0, Op1);

        cast<BinaryOperator>(NewI)->copyIRFlags(I, false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        Value *Op0 = ShrinkOperand(CI->getOperand(0));
        Value *Op1 = ShrinkOperand(CI->getOperand(1));
        NewI = B.CreateICmp(CI->getPredicate(), Op0, Op1);
B.CreateICmp(CI->getPredicate(), Op0, Op1);
3446 }
else if (
auto *SI = dyn_cast<SelectInst>(
I)) {
3447 Value *TV = ShrinkOperand(SI->getTrueValue());
3448 Value *FV = ShrinkOperand(SI->getFalseValue());
3449 NewI =
B.CreateSelect(SI->getCondition(), TV, FV);
3450 }
else if (
auto *CI = dyn_cast<CastInst>(
I)) {
3451 switch (CI->getOpcode()) {
3454 case Instruction::Trunc:
3455 NewI = ShrinkOperand(CI->getOperand(0));
3457 case Instruction::SExt:
3458 NewI =
B.CreateSExtOrTrunc(
3462 case Instruction::ZExt:
3463 NewI =
B.CreateZExtOrTrunc(
3468 }
else if (
auto *SI = dyn_cast<ShuffleVectorInst>(
I)) {
3470 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3471 auto *O0 =
B.CreateZExtOrTrunc(
3474 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3475 auto *O1 =
B.CreateZExtOrTrunc(
3478 NewI =
B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3479 }
else if (isa<LoadInst>(
I) || isa<PHINode>(
I)) {
3482 }
else if (
auto *IE = dyn_cast<InsertElementInst>(
I)) {
3484 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3485 auto *O0 =
B.CreateZExtOrTrunc(
3487 auto *O1 =
B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3488 NewI =
B.CreateInsertElement(O0, O1, IE->getOperand(2));
3489 }
else if (
auto *EE = dyn_cast<ExtractElementInst>(
I)) {
3491 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3492 auto *O0 =
B.CreateZExtOrTrunc(
3494 NewI =
B.CreateExtractElement(O0, EE->getOperand(2));
3502 Value *Res =
B.CreateZExtOrTrunc(NewI, OriginalTy);
3503 I->replaceAllUsesWith(Res);
3504 cast<Instruction>(
I)->eraseFromParent();
3506 State.
reset(Def, Res, Part);
3511 for (
const auto &KV :
Cost->getMinimalBitwidths()) {
3520 for (
unsigned Part = 0; Part < UFToUse; ++Part) {
3526 State.
reset(Def, NewI, Part);
  for (PHINode &PN : Exit->phis())

    KV.second->fixPhi(Plan, State);

    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))

      auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;

    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))

  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;

  Value *RuntimeVF = nullptr;

  auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
         RecurSplice->getOpcode() ==
         "recurrence phi must have a single user: FirstOrderRecurrenceSplice");

  for (VPUser *U : RecurSplice->users())
    if (auto *LiveOut = dyn_cast<VPLiveOut>(U))

  if (!LiveOuts.empty()) {
    Value *ExtractForPhiUsedOutsideLoop = nullptr;
          Incoming, Idx, "vector.recur.extract.for.phi");
      assert(UF > 1 && "VF and UF cannot both be 1");
      ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

      PHINode *LCSSAPhi = LiveOut->getPhi();

    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);

  Phi->setName("scalar.recur");

         "Unable to find the reduction variable");
  if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert((!Sel || U == Sel) &&
                 "Reduction exit feeding two different selects");
          Sel = cast<SelectInst>(U);
          assert(isa<PHINode>(U) && "Reduction exit must feed PHIs or a select");
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

          cast<PHINode>(State.get(PhiR, Part));
      VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);

    assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");

    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);
          U->replaceUsesOfWith(RdxParts[Part], Extnd);
        RdxParts[Part] = Extnd;

    for (unsigned Part = 0; Part < UF; ++Part) {
      State.reset(LoopExitInstDef, RdxParts[Part], Part);

  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);

    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);

    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
            ReducedPartRdx, RdxPart);

      ReducedPartRdx = RdxDesc.isSigned()

  BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
  BCBlockPhi->addIncoming(ReductionStartValue, Incoming);

  int IncomingEdgeBlockIdx =
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
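  // Reduction fix-up: the per-part values leaving the vector loop are merged
  // into ReducedPartRdx (min/max recurrences via their compare opcode,
  // everything else via the reduction binary op), truncated reductions are
  // first sign- or zero-extended back to the phi type, and the merged value
  // is routed through the bypass phi so both the scalar remainder loop and
  // the exit block observe it.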
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
    return BB == PredBB;

    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();

    while (!Worklist.empty()) {
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects() || I->mayReadFromMemory())

      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());

      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));

  return Cost->useOrderedReductions(RdxDesc);
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
         "This function should not be visited twice for the same VF");

  Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());

           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
           "Ptr is neither a value nor a pointer operand");

  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&

    if (!isLoopVaryingBitCastOrGEP(Ptr))

    auto *I = cast<Instruction>(Ptr);
      return isa<LoadInst>(U) || isa<StoreInst>(U);
      PossibleNonScalarPtrs.insert(I);

    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());

  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {

  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");

  while (Idx != Worklist.size()) {
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
    auto *Src = cast<Instruction>(Dst->getOperand(0));
      auto *J = cast<Instruction>(U);
      return !TheLoop->contains(J) || Worklist.count(J) ||
             ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
              isScalarUse(J, Src));
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");

    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
      return Induction.second.getKind() ==
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&

      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);

    auto ScalarIndUpdate =
      auto *I = cast<Instruction>(U);
      return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    if (!ScalarIndUpdate)

    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
  switch (I->getOpcode()) {
  case Instruction::Call:
  case Instruction::Load:
  case Instruction::Store: {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {

  switch (I->getOpcode()) {
  case Instruction::Load:
  case Instruction::Store: {
        (isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Call:

std::pair<InstructionCost, InstructionCost>

  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
    ScalarizationCost = 0;

    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);

      Instruction::Select, VecTy,

  Value *Op2 = I->getOperand(1);
      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
  return {ScalarizationCost, SafeDivisorCost};

         "Decision should not be set yet.");
  assert(Group && "Must have a group.");

  auto &DL = I->getModule()->getDataLayout();

  unsigned InterleaveFactor = Group->getFactor();
  bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
    if (MemberNI != ScalarNI) {
    } else if (MemberNI && ScalarNI &&
               ScalarTy->getPointerAddressSpace() !=
                   MemberTy->getPointerAddressSpace()) {

  bool PredicatedAccessRequiresMasking =
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)

         "Masked interleave-groups for predicated accesses are not enabled.");

  if (Group->isReverse())

  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");

  auto &DL = I->getModule()->getDataLayout();

void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
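  // collectLoopUniforms(VF) (its body is elided here) is the counterpart of
  // collectLoopScalars: it records values that are provably the same for all
  // lanes of the given VF, for example addresses of consecutive or
  // interleaved accesses, so later cost queries and code generation can keep
  // a single copy per unrolled part.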