#include <functional>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
    cl::desc("Enable vectorization of epilogue loops."));

    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));
215 "prefer-predicate-over-epilogue",
218 cl::desc(
"Tail-folding and predication preferences over creating a scalar "
222 "Don't tail-predicate loops, create scalar epilogue"),
224 "predicate-else-scalar-epilogue",
225 "prefer tail-folding, create scalar epilogue if tail "
228 "predicate-dont-vectorize",
229 "prefers tail-folding, don't attempt vectorization if "
230 "tail-folding fails.")));
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

    cl::desc("A flag that overrides the target's number of scalar registers."));

    cl::desc("A flag that overrides the target's number of vector registers."));

    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

             "Pretend that scalable vectors are supported, even if the target does "
             "not support them. This flag should only be used for testing."));
285 "The cost of a loop that is considered 'small' by the interleaver."));
289 cl::desc(
"Enable the use of the block frequency analysis to access PGO "
290 "heuristics minimizing code growth in cold regions and being more "
291 "aggressive in hot regions."));
297 "Enable runtime interleaving until load/store ports are saturated"));
302 cl::desc(
"Enable interleaving for loops with small iteration counts that "
303 "contain scalar reductions to expose ILP."));
308 cl::desc(
"Max number of stores to be predicated behind an if."));
312 cl::desc(
"Count the induction variable only once when interleaving"));
316 cl::desc(
"Enable if predication of stores during vectorization."));
320 cl::desc(
"The maximum interleave count to use when interleaving a scalar "
321 "reduction in a nested loop."));
326 cl::desc(
"Prefer in-loop vector reductions, "
327 "overriding the targets preference."));
332 "Prefer predicating a reduction operation over an after loop select."));
336 cl::desc(
"Enable VPlan-native vectorization path with "
337 "support for outer loop vectorization."));
343 cl::desc(
"Enable VPlan-native vectorization path predicator with "
344 "support for outer loop vectorization."));
353 "Build VPlan for every supported loop nest in the function and bail "
354 "out right after the build (stress test the VPlan H-CFG construction "
355 "in the VPlan-native vectorization path)."));
359 cl::desc(
"Enable loop interleaving in Loop vectorization passes"));
362 cl::desc(
"Run the Loop vectorization passes"));
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();

      DL.getTypeAllocSize(Ty).getFixedValue(),

  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);

  if (isa<FPMathOperator>(V))
                            const VPIteration &Instance, bool IfPredicateInstr,

                              VPValue *BlockInMask = nullptr);

                          Instruction::BinaryOpsEnd);

                       unsigned Part, unsigned Lane = UINT_MAX);

      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  std::unique_ptr<LoopVersioning> LVer;

                          Instruction::BinaryOpsEnd) override;

           "A high UF for the epilogue loop is likely not beneficial.");
  if (I->getDebugLoc() != Empty)

  for (Use &Op : I->operands()) {

    if (OpInst->getDebugLoc() != Empty)

  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {

    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {

        B.SetCurrentDebugLocation(NewDIL.getValue());

                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());

      B.SetCurrentDebugLocation(DIL);

  dbgs() << "LV: Not vectorizing: " << DebugMsg;

    dbgs() << " " << *I;

    CodeRegion = I->getParent();

    if (I->getDebugLoc())
      DL = I->getDebugLoc();

  R << "loop not vectorized: ";

  assert(isa<ConstantInt>(Step) && "Expected an integer step");

                                ORETag, TheLoop, I) << OREMsg);

  LoopDbgLoc.print(OS);

  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);

  for (Value *V : To) {
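// LoopVectorizationCostModel caches per-VF analyses (uniform and scalar
// instructions, instructions worth scalarizing, minimum bit widths, and
// memory-widening decisions) and answers the profitability queries used when
// picking a vectorization and interleave factor.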
        Hints(Hints), InterleaveInfo(IAI) {}

  bool runtimeChecksRequired();

  selectEpilogueVectorizationFactor(const ElementCount MaxVF,

    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);

  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  void collectValuesToIgnore();

  void collectInLoopReductions();

           "Profitable to scalarize relevant only for VF > 1.");

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);

    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
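// Widening decisions are memoized per (Instruction, VF) pair: the map records
// both the chosen strategy (e.g. widen, interleave, gather/scatter, or
// scalarize) and the cost computed for it, so later queries can assert that a
// decision and its cost have already been calculated.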
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);

      WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);

      WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);

      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())

    return Itr->second.first;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;

    auto *Trunc = dyn_cast<TruncInst>(I);

      Value *Op = Trunc->getOperand(0);

    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())

    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);

    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));

      RecurrenceDescriptor RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);

    if (!blockNeedsPredication(I->getParent()))

    if (isa<LoadInst>(I) || isa<StoreInst>(I))

    return isScalarWithPredication(I);

    return InterleaveInfo.isInterleaved(Instr);

    return InterleaveInfo.getInterleaveGroup(Instr);

    if (!isScalarEpilogueAllowed())

    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())

    return InterleaveInfo.requiresScalarEpilogue();

  using ReductionChainMap =

    return InLoopReductionChains;

    return InLoopReductionChains.count(Phi);

                                     bool &NeedToScalarize);

    WideningDecisions.clear();

  unsigned NumPredStores = 0;

  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,

  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  bool FoldTailByMasking = false;

  ReductionChainMap InLoopReductionChains;

  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,

                              std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

        TheLoop->isLoopInvariant(I))

    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);

        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));

  bool isCandidateForEpilogueVectorization(const Loop &L,

  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");

    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "

  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {

  for (Loop *InnerL : L)
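// Legacy pass wrapper: LoopVectorize gathers the analyses the vectorizer
// depends on (SCEV, LoopInfo, TTI, DominatorTree, BFI, TLI, AA,
// AssumptionCache, LoopAccess analysis, DemandedBits, ORE, PSI) and forwards
// them to the shared runImpl entry point of LoopVectorizePass.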
  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {

    if (skipFunction(F))

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
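// Widening an integer or FP induction: the start value is broadcast to a
// vector, a step vector <0, Step, 2*Step, ...> is added to form the first
// iteration's values, and a vector phi is advanced through the loop latch by
// the splatted VF * Step value, with one increment per unrolled part.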
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());

  Value *SteppedStart =

    MulOp = Instruction::Mul;

    MulOp = Instruction::FMul;

  Value *SplatVF = isa<Constant>(Mul)

  for (unsigned Part = 0; Part < UF; ++Part) {
    State.set(Def, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))

  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());

  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(LastInduction, LoopVectorLatch);

  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);

                                               unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  if (isa<TruncInst>(EntryVal))

  if (Lane < UINT_MAX)

  State.set(CastDef, VectorLoopVal, Part);

         "Primary induction variable must have an integer type");
  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
           "Induction step should be loop invariant");

      return Exp.expandCodeFor(Step, Step->getType(),

    return cast<SCEVUnknown>(Step)->getValue();

  auto CreateScalarIV = [&](Value *&Step) -> Value * {

      ScalarIV->setName("offset.idx");

      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");

  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {

    for (unsigned Part = 0; Part < UF; ++Part) {

                              ID.getInductionOpcode());
      State.set(Def, EntryPart, Part);

  Value *Step = CreateStepValue(ID.getStep());

    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);

  if (!NeedsScalarIV) {

    Value *ScalarIV = CreateScalarIV(Step);

  Value *ScalarIV = CreateScalarIV(Step);

  CreateSplatIV(ScalarIV, Step);
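// getStepVector produces <Val, Val + Step, Val + 2*Step, ...> for a
// fixed-width vector, using integer add/mul or the induction's FAdd/FSub
// opcode (with fast-math flags propagated) for floating-point inductions.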
  auto *ValVTy = cast<FixedVectorType>(Val->getType());
  int VLen = ValVTy->getNumElements();

         "Induction Step must be an integer or FP");

    for (int i = 0; i < VLen; ++i)

  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");

  for (int i = 0; i < VLen; ++i)

  if (isa<Instruction>(MulOp))
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);

         "Val and Step should have the same type");

    MulOp = Instruction::Mul;

    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;

         "Should never scalarize a scalable vector");

  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {

             "Expected StartIdx to be folded to a constant when VF is not "
  unsigned InterleaveFactor = Group->getFactor();

         "Reversed masked interleave-group not supported.");

           "scalable vector reverse operation is not implemented");

  for (unsigned Part = 0; Part < UF; Part++) {

    bool InBounds = false;

      InBounds = gep->isInBounds();

    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

  Value *MaskForGaps = nullptr;

    assert(MaskForGaps && "Mask for Gaps is required but it is null");

  if (isa<LoadInst>(Instr)) {

    for (unsigned Part = 0; Part < UF; Part++) {

      if (BlockInMask || MaskForGaps) {

               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;

          Value *BlockInMaskPart = State.get(BlockInMask, Part);

                                     "interleaved.mask");
          GroupMask = MaskForGaps

            GroupMask, PoisonVec, "wide.masked.vec");

      NewLoads.push_back(NewLoad);

    for (unsigned I = 0; I < InterleaveFactor; ++I) {

      for (unsigned Part = 0; Part < UF; Part++) {

            NewLoads[Part], StrideMask, "strided.vec");

        if (Member->getType() != ScalarTy) {

        State.set(VPDefs[J], StridedVec, Part);

  for (unsigned Part = 0; Part < UF; Part++) {

    for (unsigned i = 0; i < InterleaveFactor; i++) {

      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (StoredVec->getType() != SubVT)

      StoredVecs.push_back(StoredVec);

      Value *BlockInMaskPart = State.get(BlockInMask, Part);

                                    "interleaved.mask");

        IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
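// A widened load or store either uses a consecutive (possibly reversed)
// pointer with a wide, optionally masked, memory operation, or falls back to
// a gather/scatter when the access is not consecutive; the cost model's
// widening decision picks between the two.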
  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

         "CM decision is not to widen the memory instruction");

  bool ConsecutiveStride =
  bool CreateGatherScatter =

  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");
  (void)ConsecutiveStride;

  bool isMaskRequired = BlockInMask;

    for (unsigned Part = 0; Part < UF; ++Part)
      BlockInMaskParts[Part] = State.get(BlockInMask, Part);

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();

             "Reversing vectors is not yet supported for scalable vectors.");

        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);

      PartPtr = cast<GetElementPtrInst>(

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();

    for (unsigned Part = 0; Part < UF; ++Part) {

      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;

                                        BlockInMaskParts[Part]);

  assert(LI && "Must have a load instruction");

  for (unsigned Part = 0; Part < UF; ++Part) {

    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;

                                         nullptr, "wide.masked.gather");

                                       "wide.masked.load");

    State.set(Def, NewLI, Part);
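// scalarizeInstruction clones the original instruction for a single (part,
// lane) instance, rewiring its operands to the per-lane scalar values (lane 0
// is used for operands that are uniform); a cloned llvm.assume is registered
// with the assumption cache, and predicated clones are recorded so they can
// later be sunk into their predicated blocks.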
                                               bool IfPredicateInstr,

  if (isa<NoAliasScopeDeclInst>(Instr))

    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;

      InputInstance.Lane = 0;

  State.set(Def, Cloned, Instance);

  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)

  if (IfPredicateInstr)

  assert(L && "Create Trip Count for null loop.");

  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
         "Invalid loop count");

  assert(IdxTy && "No type for induction");

        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

         "VF*UF must be a power of 2 when folding tail by masking");

         "Tail folding not yet supported for scalable vectors");

  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstFVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

         "Only one type should be a pointer type");

         "Only one type should be a floating point type");

         "TC check is expected to dominate Bypass");

  Value *SCEVCheck = Exp.expandCodeForPredicate(

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))

         "Cannot SCEV check stride or overflow when optimizing for size");

  SCEVCheckBlock->setName("vector.scevcheck");

                                   nullptr, "vector.ph");

  if (!RtPtrChecking.Need)

         "Cannot emit memory checks when optimizing for size, unless forced "

             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";

  MemCheckBlock->setName("vector.memcheck");

  auto *CondBranch = cast<BranchInst>(

  assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
                            "claimed checks are required");
  CondBranch->setCondition(MemRuntimeCheck);

    LVer = std::make_unique<LoopVersioning>(

    LVer->prepareNoAliasMetadata();
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();

         "Index type does not match StepValue type");

    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))

    if (auto *CY = dyn_cast<ConstantInt>(Y))

    return B.CreateAdd(X, Y);

    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))

    if (auto *CY = dyn_cast<ConstantInt>(Y))

    return B.CreateMul(X, Y);

  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();

    return &*B.GetInsertPoint();

  switch (ID.getKind()) {

    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);

        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));

    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");

        StartValue->getType()->getPointerElementType(), StartValue,

               Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));

    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();

           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    if (isa<Instruction>(MulExp))
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,

    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
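// Induction resume values: for each induction variable an end value is
// computed from the vector trip count so that the scalar remainder loop, and
// any additional bypass edge used by epilogue vectorization, can resume
// exactly where the vector loop left off ("ind.end").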
    std::pair<BasicBlock *, Value *> AdditionalBypass) {

  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");

    PHINode *OrigPhi = InductionEntry.first;

    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;

      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));

            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =

        EndValueFromAdditionalBypass->setName("ind.end");

    if (AdditionalBypass.first)
                                              EndValueFromAdditionalBypass);

  assert(L && "Expected valid loop.");

    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());

         "Inconsistent vector loop preheader");

#ifdef EXPENSIVE_CHECKS

      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;

    auto *UI = cast<Instruction>(U);

      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      Value *CountMinusOne = B.CreateSub(

              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,

      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
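// Common subexpression elimination over the freshly generated vector body:
// insertelement, extractelement, shufflevector and getelementptr instructions
// are hashed by opcode and operands, and later duplicates are replaced with
// the first equivalent instruction and erased.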
struct CSEDenseMapInfo {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);

    assert(canHandle(I) && "Unknown instruction!");
                                      I->value_op_end()));

    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())

    if (!CSEDenseMapInfo::canHandle(In))

      In->replaceAllUsesWith(V);
      In->eraseFromParent();
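// Call cost: the scalar call cost (times VF) is compared with the cost of
// calling a vectorized library routine when one exists; NeedToScalarize
// records whether scalarizing the call is the cheaper option.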
                                                   bool &NeedToScalarize) {

    ScalarTys.push_back(ArgOp->getType());

    return ScalarCallCost;

  for (Type *ScalarTy : ScalarTys)

  NeedToScalarize = true;

    if (VectorCallCost < Cost) {
      NeedToScalarize = false;
      Cost = VectorCallCost;

  assert(ID && "Expected intrinsic call!");

  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

                 std::back_inserter(ParamTys),
                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });

                                  dyn_cast<IntrinsicInst>(CI));

  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;

  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
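// Minimal bit width truncation: vector instructions whose demanded bits fit
// in a narrower element type are recreated on the truncated type, zext/trunc
// casts are inserted at the boundaries, and the original wider instructions
// are erased once all of their uses have been rewritten.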
  for (unsigned Part = 0; Part < UF; ++Part) {

      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))

      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =

          cast<FixedVectorType>(OriginalTy)->getNumElements());
      if (TruncatedTy == OriginalTy)

      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);

      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        cast<BinaryOperator>(NewI)->copyIRFlags(I, false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {

        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));

        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(

        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(

      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())

        auto *O0 = B.CreateZExtOrTrunc(

        auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())

        auto *O1 = B.CreateZExtOrTrunc(

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {

      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())

        auto *O0 = B.CreateZExtOrTrunc(

        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())

        auto *O0 = B.CreateZExtOrTrunc(

        NewI = B.CreateExtractElement(O0, EE->getOperand(2));

      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();

  for (unsigned Part = 0; Part < UF; ++Part) {

         "Unexpected non-induction PHIs for fixup in non VPlan-native path");
  auto *VectorInit = ScalarInit;

  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");

  Value *PreviousLastPart = State.get(PreviousDef, UF - 1);

    Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
    if (isa<PHINode>(PreviousLastPart))

  Value *Incoming = VecPhi;

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = State.get(PreviousDef, Part);
    Value *PhiPart = State.get(PhiDef, Part);

    cast<Instruction>(PhiPart)->eraseFromParent();
    State.reset(PhiDef, Shuffle, Part);
    Incoming = PreviousPart;

  auto *ExtractForScalar = Incoming;

        "vector.recur.extract");

  Value *ExtractForPhiUsedOutsideLoop = nullptr;

        "vector.recur.extract.for.phi");

    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  Start->addIncoming(Incoming, BB);

    if (any_of(LCSSAPhi.incoming_values(),
               [Phi](Value *V) { return V == Phi; }))

         "Unable to find the reduction variable");
  for (unsigned Part = 0; Part < UF; ++Part) {

    cast<PHINode>(VecRdxPhi)

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");

          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");

      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      VecRdxPhi->setIncomingValueForBlock(

    assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");

    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);

           UI != RdxParts[Part]->user_end();)

        RdxParts[Part] = Extnd;

    for (unsigned Part = 0; Part < UF; ++Part) {

      State.reset(LoopExitInstDef, RdxParts[Part], Part);

  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);

    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {

  if (VF.isVector() && !IsInLoopReductionPhi) {

    if (any_of(LCSSAPhi.incoming_values(),
               [LoopExitInst](Value *V) { return V == LoopExitInst; }))

  int IncomingEdgeBlockIdx =

  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");

  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);

  assert(LoopExitInstr && "null loop exit instruction");

  Worklist.push_back(LoopExitInstr);
  Visited.insert(LoopExitInstr);

  while (!Worklist.empty()) {

    if (isa<OverflowingBinaryOperator>(Cur))
      for (unsigned Part = 0; Part < UF; ++Part) {

        cast<Instruction>(V)->dropPoisonGeneratingFlags();

          Visited.insert(UI).second)
        Worklist.push_back(UI);

    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);

    unsigned LastLane = 0;
    if (isa<Instruction>(IncomingValue))

                     cast<Instruction>(IncomingValue), VF)

           "scalable vectors don't support non-uniform scalars yet");

    Value *lastIncomingValue =
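// sinkScalarOperands iterates to a fixed point, moving scalarized instructions
// whose uses are all inside a predicated block down into that block, so the
// scalar work only executes when the predicate holds; instructions that
// cannot be sunk yet are queued for reanalysis on the next round.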
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());

    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(

    return BB == PredBB;

    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();

    while (!Worklist.empty()) {

      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())

        InstsToReanalyze.push_back(I);

      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());
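// Widened GEPs: if the pointer operand and every index are loop-invariant the
// scalar GEP is kept (and its result splatted when a vector value is needed);
// otherwise a vector GEP is created per unrolled part from the widened or
// invariant operands, and the result must be a vector of pointers.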
    PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));

  if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {

    for (unsigned Part = 0; Part < UF; ++Part) {

    for (unsigned Part = 0; Part < UF; ++Part) {

      auto *Ptr = IsPtrLoopInvariant

      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {

        if (IsIndexLoopInvariant[I - 1])

          Indices.push_back(State.get(Operand, Part));

             "NewGEP is not a pointer vector");

    State.set(Def, VecPhi, 0);

         "Non-header phis should have been handled elsewhere");
    Value *Iden = nullptr;

           "RdxDesc should only be set for reduction variables; in that case "
           "a StartV is also required");

      for (unsigned Part = 0; Part < State.UF; ++Part) {

        State.set(Def, EntryPart, Part);

        Value *StartVal = (Part == 0) ? StartV : Iden;

           "reductions should be handled above");