1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to determine the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
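//
// For example (a conceptual sketch, assuming a vector width of 4 and leaving
// out the remainder/epilogue handling described elsewhere in this file), the
// scalar loop
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
// becomes a loop in which each iteration operates on four elements at once:
//   for (i = 0; i < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];  // one wide load/add/store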
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is an ongoing development effort to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanTransforms.h"
63#include "VPlanVerifier.h"
64#include "llvm/ADT/APInt.h"
65#include "llvm/ADT/ArrayRef.h"
66#include "llvm/ADT/DenseMap.h"
68#include "llvm/ADT/Hashing.h"
69#include "llvm/ADT/MapVector.h"
70#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
81#include "llvm/Analysis/CFG.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfo.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/MDBuilder.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
122#include "llvm/IR/Type.h"
123#include "llvm/IR/Use.h"
124#include "llvm/IR/User.h"
125#include "llvm/IR/Value.h"
126#include "llvm/IR/ValueHandle.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
145#include <algorithm>
146#include <cassert>
147#include <cmath>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <map>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159
160#define LV_NAME "loop-vectorize"
161#define DEBUG_TYPE LV_NAME
162
163#ifndef NDEBUG
164const char VerboseDebug[] = DEBUG_TYPE "-verbose";
165#endif
166
167/// @{
168/// Metadata attribute names
169const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171 "llvm.loop.vectorize.followup_vectorized";
173 "llvm.loop.vectorize.followup_epilogue";
174/// @}
175
176STATISTIC(LoopsVectorized, "Number of loops vectorized");
177STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
178STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
179
181 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
182 cl::desc("Enable vectorization of epilogue loops."));
183
185 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
186 cl::desc("When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
188 "loops."));
189
191 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
192 cl::desc("Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
194
195/// Loops with a known constant trip count below this number are vectorized only
196/// if no scalar iteration overheads are incurred.
198 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
199 cl::desc("Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
201 "are incurred."));
202
204 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
205 cl::desc("The maximum allowed number of runtime memory checks"));
206
207// The option prefer-predicate-over-epilogue indicates that an epilogue is
208// undesired and predication is preferred. I.e., the vectorizer will try to
209// fold the tail loop (epilogue) into the vector body and predicate the
210// instructions accordingly. If tail-folding fails, there are different
211// fallback strategies depending on the values listed below.
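//
// As a rough sketch (assuming a vector width of 4 and a hypothetical
// mask-based lowering; the exact form depends on the chosen style), folding
// the tail means the vector body is predicated instead of emitting a scalar
// remainder loop:
//   for (i = 0; i < n; i += 4) {
//     mask = {i, i+1, i+2, i+3} < n;        // e.g. an active.lane.mask
//     a[i..i+3] = b[i..i+3] + c[i..i+3];    // executed under 'mask'
//   }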
213 enum Option {
217 };
218} // namespace PreferPredicateTy
219
221 "prefer-predicate-over-epilogue",
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(TailFoldingStyle::None),
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));
258
260 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
261 cl::desc("Maximize bandwidth when selecting vectorization factor which "
262 "will be determined by the smallest type in loop."));
263
265 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
266 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
267
268/// An interleave-group may need masking if it resides in a block that needs
269/// predication, or in order to mask away gaps.
271 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
272 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
273
275 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
276 cl::desc("A flag that overrides the target's number of scalar registers."));
277
279 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
280 cl::desc("A flag that overrides the target's number of vector registers."));
281
283 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
284 cl::desc("A flag that overrides the target's max interleave factor for "
285 "scalar loops."));
286
288 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
289 cl::desc("A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
291
293 "force-target-instruction-cost", cl::init(0), cl::Hidden,
294 cl::desc("A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
297
299 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
300 cl::desc(
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
303
305 "small-loop-cost", cl::init(20), cl::Hidden,
306 cl::desc(
307 "The cost of a loop that is considered 'small' by the interleaver."));
308
310 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
311 cl::desc("Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
314
315// Runtime interleave loops for load/store throughput.
317 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
318 cl::desc(
319 "Enable runtime interleaving until load/store ports are saturated"));
320
321/// The number of stores in a loop that are allowed to need predication.
323 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
324 cl::desc("Max number of stores to be predicated behind an if."));
325
327 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
328 cl::desc("Count the induction variable only once when interleaving"));
329
331 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
332 cl::desc("Enable if predication of stores during vectorization."));
333
335 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
336 cl::desc("The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
338
339static cl::opt<bool>
340 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
342 cl::desc("Prefer in-loop vector reductions, "
343 "overriding the targets preference."));
344
346 "force-ordered-reductions", cl::init(false), cl::Hidden,
347 cl::desc("Enable the vectorisation of loops with in-order (strict) "
348 "FP reductions"));
349
351 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Prefer predicating a reduction operation over an after loop select."));
354
355namespace llvm {
357 "enable-vplan-native-path", cl::Hidden,
358 cl::desc("Enable VPlan-native vectorization path with "
359 "support for outer loop vectorization."));
360}
361
362// This flag enables the stress testing of the VPlan H-CFG construction in the
363// VPlan-native vectorization path. It must be used in conjunction with
364// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
365// verification of the H-CFGs built.
367 "vplan-build-stress-test", cl::init(false), cl::Hidden,
368 cl::desc(
369 "Build VPlan for every supported loop nest in the function and bail "
370 "out right after the build (stress test the VPlan H-CFG construction "
371 "in the VPlan-native vectorization path)."));
372
374 "interleave-loops", cl::init(true), cl::Hidden,
375 cl::desc("Enable loop interleaving in Loop vectorization passes"));
377 "vectorize-loops", cl::init(true), cl::Hidden,
378 cl::desc("Run the Loop vectorization passes"));
379
381 "vplan-print-in-dot-format", cl::Hidden,
382 cl::desc("Use dot format instead of plain text when dumping VPlans"));
383
385 "force-widen-divrem-via-safe-divisor", cl::Hidden,
386 cl::desc(
387 "Override cost based safe divisor widening for div/rem instructions"));
388
390 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
392 cl::desc("Try wider VFs if they enable the use of vector variants"));
393
394// Likelihood of bypassing the vectorized loop because assumptions about SCEV
395// variables not overflowing do not hold. See `emitSCEVChecks`.
396static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
397// Likelihood of bypassing the vectorized loop because pointers overlap. See
398// `emitMemRuntimeChecks`.
399static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
400// Likelihood of bypassing the vectorized loop because there are zero trips left
401// after prolog. See `emitIterationCountCheck`.
402static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
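// (In each case the weight pair {1, 127} encodes an assumed probability of
// 1/128 for taking the bypass edge.)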
403
404/// A helper function that returns true if the given type is irregular. The
405/// type is irregular if its allocated size doesn't equal the store size of an
406/// element of the corresponding vector type.
407static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
408 // Determine if an array of N elements of type Ty is "bitcast compatible"
409 // with a <N x Ty> vector.
410 // This is only true if there is no padding between the array elements.
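  // For example, under a typical DataLayout i1 is irregular (type size 1 bit,
  // allocated size 8 bits) and x86_fp80 is irregular (80-bit type size with a
  // padded allocation), whereas i32 and float are not.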
411 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
412}
413
414/// A helper function that returns the reciprocal of the block probability of
415/// predicated blocks. If we return X, we are assuming the predicated block
416/// will execute once for every X iterations of the loop header.
417///
418/// TODO: We should use actual block probability here, if available. Currently,
419/// we always assume predicated blocks have a 50% chance of executing.
420static unsigned getReciprocalPredBlockProb() { return 2; }
421
422/// Returns "best known" trip count for the specified loop \p L as defined by
423/// the following procedure:
424/// 1) Returns exact trip count if it is known.
425/// 2) Returns expected trip count according to profile data if any.
426/// 3) Returns upper bound estimate if it is known.
427/// 4) Returns std::nullopt if all of the above failed.
428static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
429 Loop *L) {
430 // Check if exact trip count is known.
431 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
432 return ExpectedTC;
433
434 // Check if there is an expected trip count available from profile data.
436 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
437 return *EstimatedTC;
438
439 // Check if upper bound estimate is known.
440 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
441 return ExpectedTC;
442
443 return std::nullopt;
444}
445
446/// Return a vector containing interleaved elements from multiple
447/// smaller input vectors.
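///
/// For example (a sketch with an interleave factor of 2 and fixed 4-element
/// inputs), Vals = { <A0,A1,A2,A3>, <B0,B1,B2,B3> } produces
/// <A0,B0,A1,B1,A2,B2,A3,B3>.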
449 const Twine &Name) {
450 unsigned Factor = Vals.size();
451 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
452
453 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
454#ifndef NDEBUG
455 for (Value *Val : Vals)
456 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
457#endif
458
459 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
460 // must use intrinsics to interleave.
461 if (VecTy->isScalableTy()) {
462 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
463 return Builder.CreateIntrinsic(
464 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
465 /*FMFSource=*/nullptr, Name);
466 }
467
468 // Fixed length. Start by concatenating all vectors into a wide vector.
469 Value *WideVec = concatenateVectors(Builder, Vals);
470
471 // Interleave the elements into the wide vector.
472 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
473 return Builder.CreateShuffleVector(
474 WideVec, createInterleaveMask(NumElts, Factor), Name);
475}
476
477namespace {
478// Forward declare GeneratedRTChecks.
479class GeneratedRTChecks;
480
481using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
482} // namespace
483
484namespace llvm {
485
487
488/// InnerLoopVectorizer vectorizes loops which contain only one basic
489/// block to a specified vectorization factor (VF).
490/// This class performs the widening of scalars into vectors, or multiple
491/// scalars. This class also implements the following features:
492/// * It inserts an epilogue loop for handling loops that don't have iteration
493/// counts that are known to be a multiple of the vectorization factor.
494/// * It handles the code generation for reduction variables.
495/// * Scalarization (implementation using scalars) of un-vectorizable
496/// instructions.
497/// InnerLoopVectorizer does not perform any vectorization-legality
498/// checks, and relies on the caller to check for the different legality
499/// aspects. The InnerLoopVectorizer relies on the
500/// LoopVectorizationLegality class to provide information about the induction
501/// and reduction variables that were found for a given vectorization factor.
503public:
506 const TargetLibraryInfo *TLI,
510 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
512 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
513 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
514 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
515 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
517 // Query this against the original loop and save it here because the profile
518 // of the original loop header may change as the transformation happens.
521
523 this->MinProfitableTripCount = VecWidth;
524 else
525 this->MinProfitableTripCount = MinProfitableTripCount;
526 }
527
528 virtual ~InnerLoopVectorizer() = default;
529
530 /// Create a new empty loop that will contain vectorized instructions later
531 /// on, while the old loop will be used as the scalar remainder. Control flow
532 /// is generated around the vectorized (and scalar epilogue) loops consisting
533 /// of various checks and bypasses. Return the pre-header block of the new
534 /// loop and the start value for the canonical induction, if it is != 0. The
535 /// latter is the case when vectorizing the epilogue loop. In the case of
536/// epilogue vectorization, this function is overridden to handle the more
537 /// complex control flow around the loops. \p ExpandedSCEVs is used to
538 /// look up SCEV expansions for expressions needed during skeleton creation.
539 virtual std::pair<BasicBlock *, Value *>
540 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
541
542 /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
543 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
544
545 // Return true if any runtime check is added.
547
548 /// A type for vectorized values in the new loop. Each value from the
549 /// original loop, when vectorized, is represented by UF vector values in the
550 /// new unrolled loop, where UF is the unroll factor.
552
553 /// A helper function to scalarize a single Instruction in the innermost loop.
554 /// Generates a sequence of scalar instances for each lane between \p MinLane
555 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
556 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
557 /// Instr's operands.
558 void scalarizeInstruction(const Instruction *Instr,
559 VPReplicateRecipe *RepRecipe,
560 const VPIteration &Instance,
561 VPTransformState &State);
562
563 /// Try to vectorize interleaved access group \p Group with the base address
564 /// given in \p Addr, optionally masking the vector operations if \p
565 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
566 /// values in the vectorized loop.
568 ArrayRef<VPValue *> VPDefs,
570 ArrayRef<VPValue *> StoredValues,
571 VPValue *BlockInMask, bool NeedsMaskForGaps);
572
573 /// Fix the non-induction PHIs in \p Plan.
574 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
575
576 /// Create a new phi node for the induction variable \p OrigPhi to resume
577 /// iteration count in the scalar epilogue, from where the vectorized loop
578 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
579 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
580 /// and the resume values can come from an additional bypass block, the \p
581 /// AdditionalBypass pair provides information about the bypass block and the
582 /// end value on the edge from bypass to this loop.
584 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
585 ArrayRef<BasicBlock *> BypassBlocks,
586 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
587
588 /// Returns the original loop trip count.
589 Value *getTripCount() const { return TripCount; }
590
591 /// Used to set the trip count after ILV's construction and after the
592 /// preheader block has been executed. Note that this always holds the trip
593 /// count of the original loop for both main loop and epilogue vectorization.
594 void setTripCount(Value *TC) { TripCount = TC; }
595
596protected:
598
599 /// A small list of PHINodes.
601
602 /// A type for scalarized values in the new loop. Each value from the
603 /// original loop, when scalarized, is represented by UF x VF scalar values
604 /// in the new unrolled loop, where UF is the unroll factor and VF is the
605 /// vectorization factor.
607
608 /// Set up the values of the IVs correctly when exiting the vector loop.
609 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
610 Value *VectorTripCount, Value *EndValue,
611 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
612 VPlan &Plan, VPTransformState &State);
613
614 /// Create the exit value of first order recurrences in the middle block and
615 /// update their users.
617 VPTransformState &State);
618
619 /// Create code for the loop exit value of the reduction.
621
622 /// Iteratively sink the scalarized operands of a predicated instruction into
623 /// the block that was created for it.
624 void sinkScalarOperands(Instruction *PredInst);
625
626 /// Returns (and creates if needed) the trip count of the widened loop.
628
629 /// Returns a bitcasted value to the requested vector type.
630 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
632 const DataLayout &DL);
633
634 /// Emit a bypass check to see if the vector trip count is zero, including if
635 /// it overflows.
637
638 /// Emit a bypass check to see if all of the SCEV assumptions we've
639 /// had to make are correct. Returns the block containing the checks or
640 /// nullptr if no checks have been added.
642
643 /// Emit bypass checks to check any memory assumptions we may have made.
644 /// Returns the block containing the checks or nullptr if no checks have been
645 /// added.
647
648 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
649 /// vector loop preheader, middle block and scalar preheader.
651
652 /// Create new phi nodes for the induction variables to resume iteration count
653 /// in the scalar epilogue, from where the vectorized loop left off.
654 /// In cases where the loop skeleton is more complicated (e.g., epilogue
655 /// vectorization) and the resume values can come from an additional bypass
656 /// block, the \p AdditionalBypass pair provides information about the bypass
657 /// block and the end value on the edge from bypass to this loop.
659 const SCEV2ValueTy &ExpandedSCEVs,
660 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
661
662 /// Complete the loop skeleton by adding debug MDs, creating appropriate
663 /// conditional branches in the middle block, preparing the builder and
664 /// running the verifier. Return the preheader of the completed vector loop.
666
667 /// Allow subclasses to override and print debug traces before/after vplan
668 /// execution, when trace information is requested.
669 virtual void printDebugTracesAtStart(){};
670 virtual void printDebugTracesAtEnd(){};
671
672 /// The original loop.
674
675 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
676 /// dynamic knowledge to simplify SCEV expressions and converts them to a
677 /// more usable form.
679
680 /// Loop Info.
682
683 /// Dominator Tree.
685
686 /// Target Library Info.
688
689 /// Target Transform Info.
691
692 /// Assumption Cache.
694
695 /// Interface to emit optimization remarks.
697
698 /// The vectorization SIMD factor to use. Each vector will have this many
699 /// vector elements.
701
703
704 /// The vectorization unroll factor to use. Each scalar is vectorized to this
705 /// many different vector instructions.
706 unsigned UF;
707
708 /// The builder that we use
710
711 // --- Vectorization state ---
712
713 /// The vector-loop preheader.
715
716 /// The scalar-loop preheader.
718
719 /// Middle Block between the vector and the scalar.
721
722 /// The unique ExitBlock of the scalar loop if one exists. Note that
723 /// there can be multiple exiting edges reaching this block.
725
726 /// The scalar loop body.
728
729 /// A list of all bypass blocks. The first block is the entry of the loop.
731
732 /// Store instructions that were predicated.
734
735 /// Trip count of the original loop.
736 Value *TripCount = nullptr;
737
738 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
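  /// For example, with TripCount = 10, VF = 4 and UF = 1 the vector trip count
  /// is 8, and the remaining 2 iterations run in the scalar epilogue (unless
  /// the tail is folded).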
740
741 /// The legality analysis.
743
744 /// The profitability analysis.
746
747 // Record whether runtime checks are added.
748 bool AddedSafetyChecks = false;
749
750 // Holds the end values for each induction variable. We save the end values
751 // so we can later fix-up the external users of the induction variables.
753
754 /// BFI and PSI are used to check for profile guided size optimizations.
757
758 // Whether this loop should be optimized for size based on profile-guided size
759 // optimizations.
761
762 /// Structure to hold information about generated runtime checks, responsible
763 /// for cleaning up the checks if vectorization turns out to be unprofitable.
764 GeneratedRTChecks &RTChecks;
765
766 // Holds the resume values for reductions in the loops, used to set the
767 // correct start value of reduction PHIs when vectorizing the epilogue.
770};
771
773public:
776 const TargetLibraryInfo *TLI,
778 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
781 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
783 ElementCount::getFixed(1),
784 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
785 BFI, PSI, Check) {}
786};
787
788/// Encapsulate information regarding vectorization of a loop and its epilogue.
789/// This information is meant to be updated and used across two stages of
790/// epilogue vectorization.
793 unsigned MainLoopUF = 0;
795 unsigned EpilogueUF = 0;
800 Value *TripCount = nullptr;
802
804 ElementCount EVF, unsigned EUF)
805 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
806 assert(EUF == 1 &&
807 "A high UF for the epilogue loop is likely not beneficial.");
808 }
809};
810
811/// An extension of the inner loop vectorizer that creates a skeleton for a
812/// vectorized loop that has its epilogue (residual) also vectorized.
813/// The idea is to run the vplan on a given loop twice: first to set up the
814/// skeleton and vectorize the main loop, and second to complete the skeleton
815/// from the first step and vectorize the epilogue. This is achieved by
816/// deriving two concrete strategy classes from this base class and invoking
817/// them in succession from the loop vectorizer planner.
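///
/// A rough sketch of the resulting layout (assuming both passes succeed and,
/// hypothetically, a main loop VF of 8 with an epilogue VF of 4):
///   iteration-count checks -> main vector loop (VF=8)
///     -> epilogue iteration check -> epilogue vector loop (VF=4)
///     -> scalar remainder loop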
819public:
827 GeneratedRTChecks &Checks)
829 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
830 CM, BFI, PSI, Checks),
831 EPI(EPI) {}
832
833 // Override this function to handle the more complex control flow around the
834 // three loops.
835 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
836 const SCEV2ValueTy &ExpandedSCEVs) final {
837 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
838 }
839
840 /// The interface for creating a vectorized skeleton using one of two
841 /// different strategies, each corresponding to one execution of the vplan
842 /// as described above.
843 virtual std::pair<BasicBlock *, Value *>
844 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
845
846 /// Holds and updates state information required to vectorize the main loop
847 /// and its epilogue in two separate passes. This setup helps us avoid
848 /// regenerating and recomputing runtime safety checks. It also helps us to
849 /// shorten the iteration-count-check path length for the cases where the
850 /// iteration count of the loop is so small that the main vector loop is
851 /// completely skipped.
853};
854
855/// A specialized derived class of inner loop vectorizer that performs
856/// vectorization of *main* loops in the process of vectorizing loops and their
857/// epilogues.
859public:
867 GeneratedRTChecks &Check)
869 EPI, LVL, CM, BFI, PSI, Check) {}
870 /// Implements the interface for creating a vectorized skeleton using the
871 /// *main loop* strategy (i.e., the first pass of vplan execution).
872 std::pair<BasicBlock *, Value *>
873 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
874
875protected:
876 /// Emits an iteration count bypass check once for the main loop (when \p
877 /// ForEpilogue is false) and once for the epilogue loop (when \p
878 /// ForEpilogue is true).
879 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
880 void printDebugTracesAtStart() override;
881 void printDebugTracesAtEnd() override;
882};
883
884// A specialized derived class of inner loop vectorizer that performs
885// vectorization of *epilogue* loops in the process of vectorizing loops and
886// their epilogues.
888public:
896 GeneratedRTChecks &Checks)
898 EPI, LVL, CM, BFI, PSI, Checks) {
900 }
901 /// Implements the interface for creating a vectorized skeleton using the
902 /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
903 std::pair<BasicBlock *, Value *>
904 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
905
906protected:
907 /// Emits an iteration count bypass check after the main vector loop has
908 /// finished to see if there are any iterations left to execute by either
909 /// the vector epilogue or the scalar epilogue.
911 BasicBlock *Bypass,
912 BasicBlock *Insert);
913 void printDebugTracesAtStart() override;
914 void printDebugTracesAtEnd() override;
915};
916} // end namespace llvm
917
918/// Look for a meaningful debug location on the instruction or its
919/// operands.
921 if (!I)
922 return DebugLoc();
923
925 if (I->getDebugLoc() != Empty)
926 return I->getDebugLoc();
927
928 for (Use &Op : I->operands()) {
929 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
930 if (OpInst->getDebugLoc() != Empty)
931 return OpInst->getDebugLoc();
932 }
933
934 return I->getDebugLoc();
935}
936
937/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
938/// is passed, the message relates to that particular instruction.
939#ifndef NDEBUG
940static void debugVectorizationMessage(const StringRef Prefix,
941 const StringRef DebugMsg,
942 Instruction *I) {
943 dbgs() << "LV: " << Prefix << DebugMsg;
944 if (I != nullptr)
945 dbgs() << " " << *I;
946 else
947 dbgs() << '.';
948 dbgs() << '\n';
949}
950#endif
951
952/// Create an analysis remark that explains why vectorization failed
953///
954/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
955/// RemarkName is the identifier for the remark. If \p I is passed it is an
956/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
957/// the location of the remark. \return the remark object that can be
958/// streamed to.
960 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
961 Value *CodeRegion = TheLoop->getHeader();
962 DebugLoc DL = TheLoop->getStartLoc();
963
964 if (I) {
965 CodeRegion = I->getParent();
966 // If there is no debug location attached to the instruction, fall back to
967 // using the loop's.
968 if (I->getDebugLoc())
969 DL = I->getDebugLoc();
970 }
971
972 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
973}
974
975namespace llvm {
976
977/// Return a value for Step multiplied by VF.
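///
/// For example (a sketch): with Ty = i64, Step = 2 and a fixed VF of 4 this
/// returns the constant i64 8; with a scalable VF of 4 it returns
/// 8 * vscale, materialized via the vscale intrinsic.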
979 int64_t Step) {
980 assert(Ty->isIntegerTy() && "Expected an integer step");
981 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
982}
983
984/// Return the runtime value for VF.
986 return B.CreateElementCount(Ty, VF);
987}
988
990 Loop *OrigLoop) {
991 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
992 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
993
994 ScalarEvolution &SE = *PSE.getSE();
995 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
996}
997
999 const StringRef OREMsg, const StringRef ORETag,
1000 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1001 Instruction *I) {
1002 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1003 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1004 ORE->emit(
1005 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1006 << "loop not vectorized: " << OREMsg);
1007}
1008
1009void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1010 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1011 Instruction *I) {
1013 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1014 ORE->emit(
1015 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1016 << Msg);
1017}
1018
1019/// Report successful vectorization of the loop. In case an outer loop is
1020/// vectorized, prepend "outer" to the vectorization remark.
1022 VectorizationFactor VF, unsigned IC) {
1024 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1025 nullptr));
1026 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1027 ORE->emit([&]() {
1028 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1029 TheLoop->getHeader())
1030 << "vectorized " << LoopType << "loop (vectorization width: "
1031 << ore::NV("VectorizationFactor", VF.Width)
1032 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1033 });
1034}
1035
1036} // end namespace llvm
1037
1038#ifndef NDEBUG
1039/// \return string containing a file name and a line # for the given loop.
1040static std::string getDebugLocString(const Loop *L) {
1041 std::string Result;
1042 if (L) {
1043 raw_string_ostream OS(Result);
1044 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1045 LoopDbgLoc.print(OS);
1046 else
1047 // Just print the module name.
1048 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1049 OS.flush();
1050 }
1051 return Result;
1052}
1053#endif
1054
1055namespace llvm {
1056
1057// Loop vectorization cost-model hints for how the scalar epilogue loop should be
1058// lowered.
1060
1061 // The default: allowing scalar epilogues.
1063
1064 // Vectorization with OptForSize: don't allow epilogues.
1066
1067 // A special case of vectorisation with OptForSize: loops with a very small
1068 // trip count are considered for vectorization under OptForSize, thereby
1069 // making sure the cost of their loop body is dominant, free of runtime
1070 // guards and scalar iteration overheads.
1072
1073 // Loop hint predicate indicating an epilogue is undesired.
1075
1076 // Directive indicating we must either tail fold or not vectorize
1079
1080using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1081
1082/// LoopVectorizationCostModel - estimates the expected speedups due to
1083/// vectorization.
1084/// In many cases vectorization is not profitable. This can happen for a
1085/// number of reasons.
1086/// expected speedup/slowdowns due to the supported instruction set. We use the
1087/// TargetTransformInfo to query the different backends for the cost of
1088/// different operations.
1090public:
1094 const TargetTransformInfo &TTI,
1100 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1101 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1102 Hints(Hints), InterleaveInfo(IAI) {}
1103
1104 /// \return An upper bound for the vectorization factors (both fixed and
1105 /// scalable). If the factors are 0, vectorization and interleaving should be
1106 /// avoided up front.
1107 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1108
1109 /// \return True if runtime checks are required for vectorization, and false
1110 /// otherwise.
1111 bool runtimeChecksRequired();
1112
1113 /// Setup cost-based decisions for user vectorization factor.
1114 /// \return true if the UserVF is a feasible VF to be chosen.
1118 return expectedCost(UserVF).first.isValid();
1119 }
1120
1121 /// \return The size (in bits) of the smallest and widest types in the code
1122 /// that needs to be vectorized. We ignore values that remain scalar such as
1123 /// 64 bit loop indices.
1124 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1125
1126 /// \return The desired interleave count.
1127 /// If interleave count has been specified by metadata it will be returned.
1128 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1129 /// are the selected vectorization factor and the cost of the selected VF.
1130 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1131
1132 /// Memory access instruction may be vectorized in more than one way.
1133 /// Form of instruction after vectorization depends on cost.
1134 /// This function makes cost-based decisions for Load/Store instructions
1135 /// and collects them in a map. This decision map is used for building
1136 /// the lists of loop-uniform and loop-scalar instructions.
1137 /// The calculated cost is saved with the widening decision in order to
1138 /// avoid redundant calculations.
1140
1141 /// A call may be vectorized in different ways depending on whether we have
1142 /// vectorized variants available and whether the target supports masking.
1143 /// This function analyzes all calls in the function at the supplied VF,
1144 /// makes a decision based on the costs of available options, and stores that
1145 /// decision in a map for use in planning and plan execution.
1147
1148 /// A struct that represents some properties of the register usage
1149 /// of a loop.
1151 /// Holds the number of loop invariant values that are used in the loop.
1152 /// The key is ClassID of target-provided register class.
1154 /// Holds the maximum number of concurrent live intervals in the loop.
1155 /// The key is ClassID of target-provided register class.
1157 };
1158
1159 /// \return Returns information about the register usages of the loop for the
1160 /// given vectorization factors.
1163
1164 /// Collect values we want to ignore in the cost model.
1165 void collectValuesToIgnore();
1166
1167 /// Collect all element types in the loop for which widening is needed.
1169
1170 /// Split reductions into those that happen in the loop, and those that happen
1171 /// outside. In-loop reductions are collected into InLoopReductions.
1173
1174 /// Returns true if we should use strict in-order reductions for the given
1175 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1176 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1177 /// of FP operations.
1178 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1179 return !Hints->allowReordering() && RdxDesc.isOrdered();
1180 }
1181
1182 /// \returns The smallest bitwidth each instruction can be represented with.
1183 /// The vector equivalents of these instructions should be truncated to this
1184 /// type.
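  ///
  /// For example (a sketch): an i32 add whose result is only ever truncated to
  /// i8 can be mapped to a minimal bitwidth of 8, so its vector form may be
  /// emitted as <VF x i8> instead of <VF x i32>.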
1186 return MinBWs;
1187 }
1188
1189 /// \returns True if it is more profitable to scalarize instruction \p I for
1190 /// vectorization factor \p VF.
1192 assert(VF.isVector() &&
1193 "Profitable to scalarize relevant only for VF > 1.");
1194 assert(
1195 TheLoop->isInnermost() &&
1196 "cost-model should not be used for outer loops (in VPlan-native path)");
1197
1198 auto Scalars = InstsToScalarize.find(VF);
1199 assert(Scalars != InstsToScalarize.end() &&
1200 "VF not yet analyzed for scalarization profitability");
1201 return Scalars->second.contains(I);
1202 }
1203
1204 /// Returns true if \p I is known to be uniform after vectorization.
1206 assert(
1207 TheLoop->isInnermost() &&
1208 "cost-model should not be used for outer loops (in VPlan-native path)");
1209 // A pseudo probe needs to be duplicated for each unrolled iteration and
1210 // vector lane so that the profiled loop trip count can be accurately
1211 // accumulated instead of being undercounted.
1212 if (isa<PseudoProbeInst>(I))
1213 return false;
1214
1215 if (VF.isScalar())
1216 return true;
1217
1218 auto UniformsPerVF = Uniforms.find(VF);
1219 assert(UniformsPerVF != Uniforms.end() &&
1220 "VF not yet analyzed for uniformity");
1221 return UniformsPerVF->second.count(I);
1222 }
1223
1224 /// Returns true if \p I is known to be scalar after vectorization.
1226 assert(
1227 TheLoop->isInnermost() &&
1228 "cost-model should not be used for outer loops (in VPlan-native path)");
1229 if (VF.isScalar())
1230 return true;
1231
1232 auto ScalarsPerVF = Scalars.find(VF);
1233 assert(ScalarsPerVF != Scalars.end() &&
1234 "Scalar values are not calculated for VF");
1235 return ScalarsPerVF->second.count(I);
1236 }
1237
1238 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1239 /// for vectorization factor \p VF.
1241 return VF.isVector() && MinBWs.contains(I) &&
1242 !isProfitableToScalarize(I, VF) &&
1244 }
1245
1246 /// Decision that was taken during cost calculation for memory instruction.
1249 CM_Widen, // For consecutive accesses with stride +1.
1250 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1257
1258 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1259 /// instruction \p I and vector width \p VF.
1262 assert(VF.isVector() && "Expected VF >=2");
1263 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1264 }
1265
1266 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1267 /// interleaving group \p Grp and vector width \p VF.
1271 assert(VF.isVector() && "Expected VF >=2");
1272 /// Broadcast this decision to all instructions inside the group.
1273 /// But the cost will be assigned to one instruction only.
1274 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1275 if (auto *I = Grp->getMember(i)) {
1276 if (Grp->getInsertPos() == I)
1277 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1278 else
1279 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1280 }
1281 }
1282 }
1283
1284 /// Return the cost model decision for the given instruction \p I and vector
1285 /// width \p VF. Return CM_Unknown if this instruction did not pass
1286 /// through the cost modeling.
1288 assert(VF.isVector() && "Expected VF to be a vector VF");
1289 assert(
1290 TheLoop->isInnermost() &&
1291 "cost-model should not be used for outer loops (in VPlan-native path)");
1292
1293 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1294 auto Itr = WideningDecisions.find(InstOnVF);
1295 if (Itr == WideningDecisions.end())
1296 return CM_Unknown;
1297 return Itr->second.first;
1298 }
1299
1300 /// Return the vectorization cost for the given instruction \p I and vector
1301 /// width \p VF.
1303 assert(VF.isVector() && "Expected VF >=2");
1304 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1305 assert(WideningDecisions.contains(InstOnVF) &&
1306 "The cost is not calculated");
1307 return WideningDecisions[InstOnVF].second;
1308 }
1309
1314 std::optional<unsigned> MaskPos;
1316 };
1317
1319 Function *Variant, Intrinsic::ID IID,
1320 std::optional<unsigned> MaskPos,
1322 assert(!VF.isScalar() && "Expected vector VF");
1323 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1324 MaskPos, Cost};
1325 }
1326
1328 ElementCount VF) const {
1329 assert(!VF.isScalar() && "Expected vector VF");
1330 return CallWideningDecisions.at(std::make_pair(CI, VF));
1331 }
1332
1333 /// Return True if instruction \p I is an optimizable truncate whose operand
1334 /// is an induction variable. Such a truncate will be removed by adding a new
1335 /// induction variable with the destination type.
1337 // If the instruction is not a truncate, return false.
1338 auto *Trunc = dyn_cast<TruncInst>(I);
1339 if (!Trunc)
1340 return false;
1341
1342 // Get the source and destination types of the truncate.
1343 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1344 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1345
1346 // If the truncate is free for the given types, return false. Replacing a
1347 // free truncate with an induction variable would add an induction variable
1348 // update instruction to each iteration of the loop. We exclude from this
1349 // check the primary induction variable since it will need an update
1350 // instruction regardless.
1351 Value *Op = Trunc->getOperand(0);
1352 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1353 return false;
1354
1355 // If the truncated value is not an induction variable, return false.
1356 return Legal->isInductionPhi(Op);
1357 }
1358
1359 /// Collects the instructions to scalarize for each predicated instruction in
1360 /// the loop.
1362
1363 /// Collect Uniform and Scalar values for the given \p VF.
1364 /// The sets depend on CM decision for Load/Store instructions
1365 /// that may be vectorized as interleave, gather-scatter or scalarized.
1366 /// Also make a decision on what to do about call instructions in the loop
1367 /// at that VF -- scalarize, call a known vector routine, or call a
1368 /// vector intrinsic.
1370 // Do the analysis once.
1371 if (VF.isScalar() || Uniforms.contains(VF))
1372 return;
1375 collectLoopUniforms(VF);
1376 collectLoopScalars(VF);
1377 }
1378
1379 /// Returns true if the target machine supports masked store operation
1380 /// for the given \p DataType and kind of access to \p Ptr.
1381 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1382 return Legal->isConsecutivePtr(DataType, Ptr) &&
1383 TTI.isLegalMaskedStore(DataType, Alignment);
1384 }
1385
1386 /// Returns true if the target machine supports masked load operation
1387 /// for the given \p DataType and kind of access to \p Ptr.
1388 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1389 return Legal->isConsecutivePtr(DataType, Ptr) &&
1390 TTI.isLegalMaskedLoad(DataType, Alignment);
1391 }
1392
1393 /// Returns true if the target machine can represent \p V as a masked gather
1394 /// or scatter operation.
1396 bool LI = isa<LoadInst>(V);
1397 bool SI = isa<StoreInst>(V);
1398 if (!LI && !SI)
1399 return false;
1400 auto *Ty = getLoadStoreType(V);
1402 if (VF.isVector())
1403 Ty = VectorType::get(Ty, VF);
1404 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1405 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1406 }
1407
1408 /// Returns true if the target machine supports all of the reduction
1409 /// variables found for the given VF.
1411 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1412 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1413 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1414 }));
1415 }
1416
1417 /// Given costs for both strategies, return true if the scalar predication
1418 /// lowering should be used for div/rem. This incorporates an override
1419 /// option so it is not simply a cost comparison.
1421 InstructionCost SafeDivisorCost) const {
1422 switch (ForceSafeDivisor) {
1423 case cl::BOU_UNSET:
1424 return ScalarCost < SafeDivisorCost;
1425 case cl::BOU_TRUE:
1426 return false;
1427 case cl::BOU_FALSE:
1428 return true;
1429 };
1430 llvm_unreachable("impossible case value");
1431 }
1432
1433 /// Returns true if \p I is an instruction which requires predication and
1434 /// for which our chosen predication strategy is scalarization (i.e. we
1435 /// don't have an alternate strategy such as masking available).
1436 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1438
1439 /// Returns true if \p I is an instruction that needs to be predicated
1440 /// at runtime. The result is independent of the predication mechanism.
1441 /// Superset of instructions that return true for isScalarWithPredication.
1442 bool isPredicatedInst(Instruction *I) const;
1443
1444 /// Return the costs for our two available strategies for lowering a
1445 /// div/rem operation which requires speculating at least one lane.
1446 /// First result is for scalarization (will be invalid for scalable
1447 /// vectors); second is for the safe-divisor strategy.
1448 std::pair<InstructionCost, InstructionCost>
1450 ElementCount VF) const;
1451
1452 /// Returns true if \p I is a memory instruction with consecutive memory
1453 /// access that can be widened.
1455
1456 /// Returns true if \p I is a memory instruction in an interleaved-group
1457 /// of memory accesses that can be vectorized with wide vector loads/stores
1458 /// and shuffles.
1460
1461 /// Check if \p Instr belongs to any interleaved access group.
1463 return InterleaveInfo.isInterleaved(Instr);
1464 }
1465
1466 /// Get the interleaved access group that \p Instr belongs to.
1469 return InterleaveInfo.getInterleaveGroup(Instr);
1470 }
1471
1472 /// Returns true if we're required to use a scalar epilogue for at least
1473 /// the final iteration of the original loop.
1474 bool requiresScalarEpilogue(bool IsVectorizing) const {
1476 return false;
1477 // If we might exit from anywhere but the latch, must run the exiting
1478 // iteration in scalar form.
1480 return true;
1481 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1482 }
1483
1484 /// Returns true if we're required to use a scalar epilogue for at least
1485 /// the final iteration of the original loop for all VFs in \p Range.
1486 /// A scalar epilogue must either be required for all VFs in \p Range or for
1487 /// none.
1489 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1490 return requiresScalarEpilogue(VF.isVector());
1491 };
1492 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1493 assert(
1494 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1495 "all VFs in range must agree on whether a scalar epilogue is required");
1496 return IsRequired;
1497 }
1498
1499 /// Returns true if a scalar epilogue is allowed, i.e., not disallowed due to
1500 /// optsize or a loop hint annotation.
1502 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1503 }
1504
1505 /// Returns the TailFoldingStyle that is best for the current loop.
1506 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1507 if (!ChosenTailFoldingStyle)
1509 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1510 : ChosenTailFoldingStyle->second;
1511 }
1512
1513 /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1514 /// update may overflow or not.
1515 /// \param IsScalableVF true if scalable vector factors enabled.
1516 /// \param UserIC User specific interleave count.
1517 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1518 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1520 ChosenTailFoldingStyle =
1522 return;
1523 }
1524
1525 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1526 ChosenTailFoldingStyle = std::make_pair(
1527 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1528 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1529 return;
1530 }
1531
1532 // Set styles when forced.
1533 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1534 ForceTailFoldingStyle.getValue());
1536 return;
1537 // Override forced styles if needed.
1538 // FIXME: use actual opcode/data type for analysis here.
1539 // FIXME: Investigate opportunity for fixed vector factor.
1540 bool EVLIsLegal =
1541 IsScalableVF && UserIC <= 1 &&
1542 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1544 // FIXME: implement support for max safe dependency distance.
1546 // FIXME: remove this once reductions are supported.
1548 if (!EVLIsLegal) {
1549 // If for some reason EVL mode is unsupported, fall back to
1550 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1551 // in a generic way.
1552 ChosenTailFoldingStyle =
1555 LLVM_DEBUG(
1556 dbgs()
1557 << "LV: Preference for VP intrinsics indicated. Will "
1558 "not try to generate VP Intrinsics "
1559 << (UserIC > 1
1560 ? "since interleave count specified is greater than 1.\n"
1561 : "due to non-interleaving reasons.\n"));
1562 }
1563 }
1564
1565 /// Returns true if all loop blocks should be masked to fold tail loop.
1566 bool foldTailByMasking() const {
1567 // TODO: check if it is possible to check for None style independent of
1568 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1570 }
1571
1572 /// Returns true if the instructions in this block require predication
1573 /// for any reason, e.g. because tail folding now requires a predicate
1574 /// or because the block in the original loop was predicated.
1577 }
1578
1579 /// Returns true if VP intrinsics with explicit vector length support should
1580 /// be generated in the tail folded loop.
1581 bool foldTailWithEVL() const {
1583 // FIXME: remove this once vp_reverse is supported.
1584 none_of(
1585 WideningDecisions,
1586 [](const std::pair<std::pair<Instruction *, ElementCount>,
1587 std::pair<InstWidening, InstructionCost>>
1588 &Data) { return Data.second.first == CM_Widen_Reverse; });
1589 }
1590
1591 /// Returns true if the Phi is part of an inloop reduction.
1592 bool isInLoopReduction(PHINode *Phi) const {
1593 return InLoopReductions.contains(Phi);
1594 }
1595
1596 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1597 /// with factor VF. Return the cost of the instruction, including
1598 /// scalarization overhead if it's needed.
1600
1601 /// Estimate cost of a call instruction CI if it were vectorized with factor
1602 /// VF. Return the cost of the instruction, including scalarization overhead
1603 /// if it's needed.
1605
1606 /// Invalidates decisions already taken by the cost model.
1608 WideningDecisions.clear();
1609 CallWideningDecisions.clear();
1610 Uniforms.clear();
1611 Scalars.clear();
1612 }
1613
1614 /// The vectorization cost is a combination of the cost itself and a boolean
1615 /// indicating whether any of the contributing operations will actually
1616 /// operate on vector values after type legalization in the backend. If this
1617 /// latter value is false, then all operations will be scalarized (i.e. no
1618 /// vectorization has actually taken place).
1619 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1620
1621 /// Returns the expected execution cost. The unit of the cost does
1622 /// not matter because we use the 'cost' units to compare different
1623 /// vector widths. The cost that is returned is *not* normalized by
1624 /// the factor width. If \p Invalid is not nullptr, this function
1625 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1626 /// each instruction that has an Invalid cost for the given VF.
1630
1631 bool hasPredStores() const { return NumPredStores > 0; }
1632
1633 /// Returns true if epilogue vectorization is considered profitable, and
1634 /// false otherwise.
1635 /// \p VF is the vectorization factor chosen for the original loop.
1637
1638private:
1639 unsigned NumPredStores = 0;
1640
1641 /// \return An upper bound for the vectorization factors for both
1642 /// fixed and scalable vectorization, where the minimum-known number of
1643 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1644 /// disabled or unsupported, then the scalable part will be equal to
1645 /// ElementCount::getScalable(0).
1646 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1647 ElementCount UserVF,
1648 bool FoldTailByMasking);
1649
1650 /// \return the maximized element count based on the target's vector
1651 /// registers and the loop trip-count, but limited to a maximum safe VF.
1652 /// This is a helper function of computeFeasibleMaxVF.
1653 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1654 unsigned SmallestType,
1655 unsigned WidestType,
1656 ElementCount MaxSafeVF,
1657 bool FoldTailByMasking);
1658
1659 /// \return the maximum legal scalable VF, based on the safe max number
1660 /// of elements.
1661 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1662
1663 /// Returns the execution time cost of an instruction for a given vector
1664 /// width. Vector width of one means scalar.
1665 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1666
1667 /// The cost-computation logic from getInstructionCost which provides
1668 /// the vector type as an output parameter.
1669 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1670 Type *&VectorTy);
1671
1672 /// Return the cost of instructions in an inloop reduction pattern, if I is
1673 /// part of that pattern.
1674 std::optional<InstructionCost>
1675 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1677
1678 /// Calculate vectorization cost of memory instruction \p I.
1679 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1680
1681 /// The cost computation for scalarized memory instruction.
1682 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1683
1684 /// The cost computation for interleaving group of memory instructions.
1685 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1686
1687 /// The cost computation for Gather/Scatter instruction.
1688 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1689
1690 /// The cost computation for widening instruction \p I with consecutive
1691 /// memory access.
1692 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1693
1694 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1695 /// Load: scalar load + broadcast.
1696 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1697 /// element)
1698 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1699
1700 /// Estimate the overhead of scalarizing an instruction. This is a
1701 /// convenience wrapper for the type-based getScalarizationOverhead API.
1702 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1704
1705 /// Returns true if an artificially high cost for emulated masked memrefs
1706 /// should be used.
1707 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1708
1709 /// Map of scalar integer values to the smallest bitwidth they can be legally
1710 /// represented as. The vector equivalents of these values should be truncated
1711 /// to this type.
1713
1714 /// A type representing the costs for instructions if they were to be
1715 /// scalarized rather than vectorized. The entries are Instruction-Cost
1716 /// pairs.
1717 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1718
1719 /// A set containing all BasicBlocks that are known to be present after
1720 /// vectorization as predicated blocks.
1722 PredicatedBBsAfterVectorization;
1723
1724 /// Records whether it is allowed to have the original scalar loop execute at
1725 /// least once. This may be needed as a fallback loop in case runtime
1726 /// aliasing/dependence checks fail, or to handle the tail/remainder
1727 /// iterations when the trip count is unknown or doesn't divide by the VF,
1728 /// or as a peel-loop to handle gaps in interleave-groups.
1729 /// Under optsize and when the trip count is very small we don't allow any
1730 /// iterations to execute in the scalar loop.
1731 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1732
1733 /// The finally chosen tail folding style. The first element is used if the
1734 /// IV update may overflow; the second element is used if it does not.
1735 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1736 ChosenTailFoldingStyle;
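  // A minimal sketch of how this pair is typically consulted (assuming an
  // accessor of the form getTailFoldingStyle(bool IVUpdateMayOverflow); see
  // that accessor for the authoritative logic):
  //
  //   if (!ChosenTailFoldingStyle)
  //     return TailFoldingStyle::None;
  //   return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
  //                              : ChosenTailFoldingStyle->second;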
1737
1738 /// A map holding scalar costs for different vectorization factors. The
1739 /// presence of a cost for an instruction in the mapping indicates that the
1740 /// instruction will be scalarized when vectorizing with the associated
1741 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1743
1744 /// Holds the instructions known to be uniform after vectorization.
1745 /// The data is collected per VF.
1747
1748 /// Holds the instructions known to be scalar after vectorization.
1749 /// The data is collected per VF.
1751
1752 /// Holds the instructions (address computations) that are forced to be
1753 /// scalarized.
1755
1756 /// PHINodes of the reductions that should be expanded in-loop.
1757 SmallPtrSet<PHINode *, 4> InLoopReductions;
1758
1759 /// A Map of inloop reduction operations and their immediate chain operand.
1760 /// FIXME: This can be removed once reductions can be costed correctly in
1761 /// VPlan. This was added to allow quick lookup of the inloop operations.
1762 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1763
1764 /// Returns the expected difference in cost from scalarizing the expression
1765 /// feeding a predicated instruction \p PredInst. The instructions to
1766 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1767 /// non-negative return value implies the expression will be scalarized.
1768 /// Currently, only single-use chains are considered for scalarization.
1769 InstructionCost computePredInstDiscount(Instruction *PredInst,
1770 ScalarCostsTy &ScalarCosts,
1771 ElementCount VF);
1772
1773 /// Collect the instructions that are uniform after vectorization. An
1774 /// instruction is uniform if we represent it with a single scalar value in
1775 /// the vectorized loop corresponding to each vector iteration. Examples of
1776 /// uniform instructions include pointer operands of consecutive or
1777 /// interleaved memory accesses. Note that although uniformity implies an
1778 /// instruction will be scalar, the reverse is not true. In general, a
1779 /// scalarized instruction will be represented by VF scalar values in the
1780 /// vectorized loop, each corresponding to an iteration of the original
1781 /// scalar loop.
1782 void collectLoopUniforms(ElementCount VF);
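  // Example (an illustrative sketch, not taken from this file): for a loop
  // body performing a[i] = a[i] + 1, the address computation feeding the
  // consecutive load/store is uniform after vectorization. A single scalar
  // GEP per vector iteration yields the base address of the wide access,
  // while the loaded and stored data are widened to VF lanes.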
1783
1784 /// Collect the instructions that are scalar after vectorization. An
1785 /// instruction is scalar if it is known to be uniform or will be scalarized
1786 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1787 /// to the list if they are used by a load/store instruction that is marked as
1788 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1789 /// VF values in the vectorized loop, each corresponding to an iteration of
1790 /// the original scalar loop.
1791 void collectLoopScalars(ElementCount VF);
1792
1793 /// Keeps cost model vectorization decision and cost for instructions.
1794 /// Right now it is used for memory instructions only.
1796 std::pair<InstWidening, InstructionCost>>;
1797
1798 DecisionList WideningDecisions;
1799
1800 using CallDecisionList =
1801 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1802
1803 CallDecisionList CallWideningDecisions;
1804
1805 /// Returns true if \p V is expected to be vectorized and it needs to be
1806 /// extracted.
1807 bool needsExtract(Value *V, ElementCount VF) const {
1808 Instruction *I = dyn_cast<Instruction>(V);
1809 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1811 return false;
1812
1813 // Assume we can vectorize V (and hence we need extraction) if the
1814 // scalars are not computed yet. This can happen, because it is called
1815 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1816 // the scalars are collected. That should be a safe assumption in most
1817 // cases, because we check if the operands have vectorizable types
1818 // beforehand in LoopVectorizationLegality.
1819 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1820 };
1821
1822 /// Returns a range containing only operands needing to be extracted.
1823 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1824 ElementCount VF) const {
1826 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1827 }
1828
1829public:
1830 /// The loop that we evaluate.
1832
1833 /// Predicated scalar evolution analysis.
1835
1836 /// Loop Info analysis.
1838
1839 /// Vectorization legality.
1841
1842 /// Vector target information.
1844
1845 /// Target Library Info.
1847
1848 /// Demanded bits analysis.
1850
1851 /// Assumption cache.
1853
1854 /// Interface to emit optimization remarks.
1856
1858
1859 /// Loop Vectorize Hint.
1861
1862 /// The interleave access information contains groups of interleaved accesses
1863 /// with the same stride and close to each other.
1865
1866 /// Values to ignore in the cost model.
1868
1869 /// Values to ignore in the cost model when VF > 1.
1871
1872 /// All element types found in the loop.
1874};
1875} // end namespace llvm
1876
1877namespace {
1878/// Helper struct to manage generating runtime checks for vectorization.
1879///
1880 /// The runtime checks are created up-front in temporary blocks to allow
1881 /// better cost estimation; they are un-linked from the existing IR. After
1882 /// deciding to vectorize, the checks are moved back. If the decision is not
1883 /// to vectorize, the temporary blocks are removed completely.
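/// A minimal caller-side sketch of the intended lifecycle (variable names are
/// illustrative and not declarations from this file):
///
///   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL, AddBranchWeights);
///   Checks.Create(L, LAI, UnionPred, VF, IC);   // build checks up-front
///   InstructionCost RTCost = Checks.getCost();  // feed the cost model
///   if (DecidedToVectorize) {
///     Checks.emitSCEVChecks(Bypass, VectorPH, ExitBB);
///     Checks.emitMemRuntimeChecks(Bypass, VectorPH);
///   }
///   // Otherwise the destructor removes the unused temporary blocks.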
1884class GeneratedRTChecks {
1885 /// Basic block which contains the generated SCEV checks, if any.
1886 BasicBlock *SCEVCheckBlock = nullptr;
1887
1888 /// The value representing the result of the generated SCEV checks. If it is
1889 /// nullptr, either no SCEV checks have been generated or they have been used.
1890 Value *SCEVCheckCond = nullptr;
1891
1892 /// Basic block which contains the generated memory runtime checks, if any.
1893 BasicBlock *MemCheckBlock = nullptr;
1894
1895 /// The value representing the result of the generated memory runtime checks.
1896 /// If it is nullptr, either no memory runtime checks have been generated or
1897 /// they have been used.
1898 Value *MemRuntimeCheckCond = nullptr;
1899
1900 DominatorTree *DT;
1901 LoopInfo *LI;
1902 TargetTransformInfo *TTI;
1903
1904 SCEVExpander SCEVExp;
1905 SCEVExpander MemCheckExp;
1906
1907 bool CostTooHigh = false;
1908 const bool AddBranchWeights;
1909
1910 Loop *OuterLoop = nullptr;
1911
1912public:
1913 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1914 TargetTransformInfo *TTI, const DataLayout &DL,
1915 bool AddBranchWeights)
1916 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1917 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1918
1919 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1920 /// accurately estimate the cost of the runtime checks. The blocks are
1921 /// un-linked from the IR and are added back during vector code generation. If
1922 /// there is no vector code generation, the check blocks are removed
1923 /// completely.
1924 void Create(Loop *L, const LoopAccessInfo &LAI,
1925 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1926
1927 // Hard cutoff to limit compile-time increase in case a very large number of
1928 // runtime checks needs to be generated.
1929 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1930 // profile info.
1931 CostTooHigh =
1933 if (CostTooHigh)
1934 return;
1935
1936 BasicBlock *LoopHeader = L->getHeader();
1937 BasicBlock *Preheader = L->getLoopPreheader();
1938
1939 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1940 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1941 // may be used by SCEVExpander. The blocks will be un-linked from their
1942 // predecessors and removed from LI & DT at the end of the function.
1943 if (!UnionPred.isAlwaysTrue()) {
1944 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1945 nullptr, "vector.scevcheck");
1946
1947 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1948 &UnionPred, SCEVCheckBlock->getTerminator());
1949 }
1950
1951 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1952 if (RtPtrChecking.Need) {
1953 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1954 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1955 "vector.memcheck");
1956
1957 auto DiffChecks = RtPtrChecking.getDiffChecks();
1958 if (DiffChecks) {
1959 Value *RuntimeVF = nullptr;
1960 MemRuntimeCheckCond = addDiffRuntimeChecks(
1961 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1962 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1963 if (!RuntimeVF)
1964 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1965 return RuntimeVF;
1966 },
1967 IC);
1968 } else {
1969 MemRuntimeCheckCond = addRuntimeChecks(
1970 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1972 }
1973 assert(MemRuntimeCheckCond &&
1974 "no RT checks generated although RtPtrChecking "
1975 "claimed checks are required");
1976 }
1977
1978 if (!MemCheckBlock && !SCEVCheckBlock)
1979 return;
1980
1981 // Unhook the temporary block with the checks, update various places
1982 // accordingly.
1983 if (SCEVCheckBlock)
1984 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1985 if (MemCheckBlock)
1986 MemCheckBlock->replaceAllUsesWith(Preheader);
1987
1988 if (SCEVCheckBlock) {
1989 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1990 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1991 Preheader->getTerminator()->eraseFromParent();
1992 }
1993 if (MemCheckBlock) {
1994 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1995 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1996 Preheader->getTerminator()->eraseFromParent();
1997 }
1998
1999 DT->changeImmediateDominator(LoopHeader, Preheader);
2000 if (MemCheckBlock) {
2001 DT->eraseNode(MemCheckBlock);
2002 LI->removeBlock(MemCheckBlock);
2003 }
2004 if (SCEVCheckBlock) {
2005 DT->eraseNode(SCEVCheckBlock);
2006 LI->removeBlock(SCEVCheckBlock);
2007 }
2008
2009 // Outer loop is used as part of the later cost calculations.
2010 OuterLoop = L->getParentLoop();
2011 }
2012
2013 InstructionCost getCost() {
2014 if (SCEVCheckBlock || MemCheckBlock)
2015 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2016
2017 if (CostTooHigh) {
2019 Cost.setInvalid();
2020 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2021 return Cost;
2022 }
2023
2024 InstructionCost RTCheckCost = 0;
2025 if (SCEVCheckBlock)
2026 for (Instruction &I : *SCEVCheckBlock) {
2027 if (SCEVCheckBlock->getTerminator() == &I)
2028 continue;
2031 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2032 RTCheckCost += C;
2033 }
2034 if (MemCheckBlock) {
2035 InstructionCost MemCheckCost = 0;
2036 for (Instruction &I : *MemCheckBlock) {
2037 if (MemCheckBlock->getTerminator() == &I)
2038 continue;
2041 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2042 MemCheckCost += C;
2043 }
2044
2045 // If the runtime memory checks are being created inside an outer loop
2046 // we should find out if these checks are outer loop invariant. If so,
2047 // the checks will likely be hoisted out and so the effective cost will
2048 // reduce according to the outer loop trip count.
2049 if (OuterLoop) {
2050 ScalarEvolution *SE = MemCheckExp.getSE();
2051 // TODO: If profitable, we could refine this further by analysing every
2052 // individual memory check, since there could be a mixture of loop
2053 // variant and invariant checks that mean the final condition is
2054 // variant.
2055 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2056 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2057 // It seems reasonable to assume that we can reduce the effective
2058 // cost of the checks even when we know nothing about the trip
2059 // count. Assume that the outer loop executes at least twice.
2060 unsigned BestTripCount = 2;
2061
2062 // If exact trip count is known use that.
2063 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2064 BestTripCount = SmallTC;
2066 // Else use profile data if available.
2067 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2068 BestTripCount = *EstimatedTC;
2069 }
2070
2071 BestTripCount = std::max(BestTripCount, 1U);
2072 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2073
2074 // Let's ensure the cost is always at least 1.
2075 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2077
2078 if (BestTripCount > 1)
2080 << "We expect runtime memory checks to be hoisted "
2081 << "out of the outer loop. Cost reduced from "
2082 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2083
2084 MemCheckCost = NewMemCheckCost;
2085 }
2086 }
2087
2088 RTCheckCost += MemCheckCost;
2089 }
2090
2091 if (SCEVCheckBlock || MemCheckBlock)
2092 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2093 << "\n");
2094
2095 return RTCheckCost;
2096 }
2097
2098 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2099 /// unused.
2100 ~GeneratedRTChecks() {
2101 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2102 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2103 if (!SCEVCheckCond)
2104 SCEVCleaner.markResultUsed();
2105
2106 if (!MemRuntimeCheckCond)
2107 MemCheckCleaner.markResultUsed();
2108
2109 if (MemRuntimeCheckCond) {
2110 auto &SE = *MemCheckExp.getSE();
2111 // Memory runtime check generation creates compares that use expanded
2112 // values. Remove them before running the SCEVExpanderCleaners.
2113 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2114 if (MemCheckExp.isInsertedInstruction(&I))
2115 continue;
2116 SE.forgetValue(&I);
2117 I.eraseFromParent();
2118 }
2119 }
2120 MemCheckCleaner.cleanup();
2121 SCEVCleaner.cleanup();
2122
2123 if (SCEVCheckCond)
2124 SCEVCheckBlock->eraseFromParent();
2125 if (MemRuntimeCheckCond)
2126 MemCheckBlock->eraseFromParent();
2127 }
2128
2129 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2130 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2131 /// depending on the generated condition.
2132 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2133 BasicBlock *LoopVectorPreHeader,
2134 BasicBlock *LoopExitBlock) {
2135 if (!SCEVCheckCond)
2136 return nullptr;
2137
2138 Value *Cond = SCEVCheckCond;
2139 // Mark the check as used, to prevent it from being removed during cleanup.
2140 SCEVCheckCond = nullptr;
2141 if (auto *C = dyn_cast<ConstantInt>(Cond))
2142 if (C->isZero())
2143 return nullptr;
2144
2145 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2146
2147 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2148 // Create new preheader for vector loop.
2149 if (OuterLoop)
2150 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2151
2152 SCEVCheckBlock->getTerminator()->eraseFromParent();
2153 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2154 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2155 SCEVCheckBlock);
2156
2157 DT->addNewBlock(SCEVCheckBlock, Pred);
2158 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2159
2160 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2161 if (AddBranchWeights)
2163 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2164 return SCEVCheckBlock;
2165 }
2166
2167 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2168 /// the branches to branch to the vector preheader or \p Bypass, depending on
2169 /// the generated condition.
2170 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2171 BasicBlock *LoopVectorPreHeader) {
2172 // Check if we generated code that checks at runtime whether arrays overlap.
2173 if (!MemRuntimeCheckCond)
2174 return nullptr;
2175
2176 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2177 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2178 MemCheckBlock);
2179
2180 DT->addNewBlock(MemCheckBlock, Pred);
2181 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2182 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2183
2184 if (OuterLoop)
2185 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2186
2187 BranchInst &BI =
2188 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2189 if (AddBranchWeights) {
2191 }
2192 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2193 MemCheckBlock->getTerminator()->setDebugLoc(
2194 Pred->getTerminator()->getDebugLoc());
2195
2196 // Mark the check as used, to prevent it from being removed during cleanup.
2197 MemRuntimeCheckCond = nullptr;
2198 return MemCheckBlock;
2199 }
2200};
2201} // namespace
2202
2203 static bool useActiveLaneMask(TailFoldingStyle Style) {
2204 return Style == TailFoldingStyle::Data ||
2205 Style == TailFoldingStyle::DataAndControlFlow ||
2206 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2207}
2208
2209 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2210 return Style == TailFoldingStyle::DataAndControlFlow ||
2211 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2212}
2213
2214// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2215// vectorization. The loop needs to be annotated with #pragma omp simd
2216// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2217// vector length information is not provided, vectorization is not considered
2218// explicit. Interleave hints are not allowed either. These limitations will be
2219// relaxed in the future.
2220 // Please note that we are currently forced to abuse the pragma 'clang
2221// vectorize' semantics. This pragma provides *auto-vectorization hints*
2222// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2223// provides *explicit vectorization hints* (LV can bypass legal checks and
2224// assume that vectorization is legal). However, both hints are implemented
2225// using the same metadata (llvm.loop.vectorize, processed by
2226// LoopVectorizeHints). This will be fixed in the future when the native IR
2227// representation for pragma 'omp simd' is introduced.
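// For example, a source-level outer loop satisfying the above (an illustrative
// sketch):
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)       // annotated outer loop, explicit VF
//     for (int j = 0; j < M; ++j)
//       A[i][j] = B[i][j] + C[i];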
2228static bool isExplicitVecOuterLoop(Loop *OuterLp,
2230 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2231 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2232
2233 // Only outer loops with an explicit vectorization hint are supported.
2234 // Unannotated outer loops are ignored.
2236 return false;
2237
2238 Function *Fn = OuterLp->getHeader()->getParent();
2239 if (!Hints.allowVectorization(Fn, OuterLp,
2240 true /*VectorizeOnlyWhenForced*/)) {
2241 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2242 return false;
2243 }
2244
2245 if (Hints.getInterleave() > 1) {
2246 // TODO: Interleave support is future work.
2247 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2248 "outer loops.\n");
2249 Hints.emitRemarkWithHints();
2250 return false;
2251 }
2252
2253 return true;
2254}
2255
2259 // Collect inner loops and outer loops without irreducible control flow. For
2260 // now, only collect outer loops that have explicit vectorization hints. If we
2261 // are stress testing the VPlan H-CFG construction, we collect the outermost
2262 // loop of every loop nest.
2263 if (L.isInnermost() || VPlanBuildStressTest ||
2265 LoopBlocksRPO RPOT(&L);
2266 RPOT.perform(LI);
2267 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2268 V.push_back(&L);
2269 // TODO: Collect inner loops inside marked outer loops in case
2270 // vectorization fails for the outer loop. Do not invoke
2271 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2272 // already known to be reducible. We can use an inherited attribute for
2273 // that.
2274 return;
2275 }
2276 }
2277 for (Loop *InnerL : L)
2278 collectSupportedLoops(*InnerL, LI, ORE, V);
2279}
2280
2281//===----------------------------------------------------------------------===//
2282// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2283// LoopVectorizationCostModel and LoopVectorizationPlanner.
2284//===----------------------------------------------------------------------===//
2285
2286/// Compute the transformed value of Index at offset StartValue using step
2287/// StepValue.
2288/// For integer induction, returns StartValue + Index * StepValue.
2289/// For pointer induction, returns StartValue[Index * StepValue].
2290/// FIXME: The newly created binary instructions should contain nsw/nuw
2291/// flags, which can be found from the original scalar operations.
2292 static Value *
2293 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2294 Value *Step,
2295 InductionDescriptor::InductionKind InductionKind,
2296 const BinaryOperator *InductionBinOp) {
2297 Type *StepTy = Step->getType();
2298 Value *CastedIndex = StepTy->isIntegerTy()
2299 ? B.CreateSExtOrTrunc(Index, StepTy)
2300 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2301 if (CastedIndex != Index) {
2302 CastedIndex->setName(CastedIndex->getName() + ".cast");
2303 Index = CastedIndex;
2304 }
2305
2306 // Note: the IR at this point is broken. We cannot use SE to create any new
2307 // SCEV and then expand it, hoping that SCEV's simplification will give us
2308 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2309 // lead to various SCEV crashes. So all we can do is to use builder and rely
2310 // on InstCombine for future simplifications. Here we handle some trivial
2311 // cases only.
2312 auto CreateAdd = [&B](Value *X, Value *Y) {
2313 assert(X->getType() == Y->getType() && "Types don't match!");
2314 if (auto *CX = dyn_cast<ConstantInt>(X))
2315 if (CX->isZero())
2316 return Y;
2317 if (auto *CY = dyn_cast<ConstantInt>(Y))
2318 if (CY->isZero())
2319 return X;
2320 return B.CreateAdd(X, Y);
2321 };
2322
2323 // We allow X to be a vector type, in which case Y will potentially be
2324 // splatted into a vector with the same element count.
2325 auto CreateMul = [&B](Value *X, Value *Y) {
2326 assert(X->getType()->getScalarType() == Y->getType() &&
2327 "Types don't match!");
2328 if (auto *CX = dyn_cast<ConstantInt>(X))
2329 if (CX->isOne())
2330 return Y;
2331 if (auto *CY = dyn_cast<ConstantInt>(Y))
2332 if (CY->isOne())
2333 return X;
2334 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2335 if (XVTy && !isa<VectorType>(Y->getType()))
2336 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2337 return B.CreateMul(X, Y);
2338 };
2339
2340 switch (InductionKind) {
2341 case InductionDescriptor::IK_IntInduction: {
2342 assert(!isa<VectorType>(Index->getType()) &&
2343 "Vector indices not supported for integer inductions yet");
2344 assert(Index->getType() == StartValue->getType() &&
2345 "Index type does not match StartValue type");
2346 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2347 return B.CreateSub(StartValue, Index);
2348 auto *Offset = CreateMul(Index, Step);
2349 return CreateAdd(StartValue, Offset);
2350 }
2351 case InductionDescriptor::IK_PtrInduction:
2352 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2353 case InductionDescriptor::IK_FpInduction: {
2354 assert(!isa<VectorType>(Index->getType()) &&
2355 "Vector indices not supported for FP inductions yet");
2356 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2357 assert(InductionBinOp &&
2358 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2359 InductionBinOp->getOpcode() == Instruction::FSub) &&
2360 "Original bin op should be defined for FP induction");
2361
2362 Value *MulExp = B.CreateFMul(Step, Index);
2363 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2364 "induction");
2365 }
2366 case InductionDescriptor::IK_NoInduction:
2367 return nullptr;
2368 }
2369 llvm_unreachable("invalid enum");
2370}
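// Worked example for emitTransformedIndex (illustrative numbers, not from the
// original source): for an integer induction with StartValue = 10 and
// Step = 3, an Index of 4 produces 10 + 4 * 3 = 22. For a pointer induction
// the same Index yields a ptradd of the start pointer by Index * Step,
// matching the StartValue[Index * StepValue] form described above.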
2371
2372std::optional<unsigned> getMaxVScale(const Function &F,
2373 const TargetTransformInfo &TTI) {
2374 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2375 return MaxVScale;
2376
2377 if (F.hasFnAttribute(Attribute::VScaleRange))
2378 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2379
2380 return std::nullopt;
2381}
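// Example (illustrative): for a function carrying the IR attribute
// vscale_range(2,16) on a target whose TTI does not report a maximum vscale,
// getMaxVScale returns 16 (the attribute's upper bound); with neither source
// of information available it returns std::nullopt.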
2382
2383/// For the given VF and UF and maximum trip count computed for the loop, return
2384/// whether the induction variable might overflow in the vectorized loop. If not,
2385/// then we know a runtime overflow check always evaluates to false and can be
2386/// removed.
2387 static bool isIndvarOverflowCheckKnownFalse(
2388 const LoopVectorizationCostModel *Cost,
2389 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2390 // Always be conservative if we don't know the exact unroll factor.
2391 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2392
2393 Type *IdxTy = Cost->Legal->getWidestInductionType();
2394 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2395
2396 // We know the runtime overflow check is known false iff the (max) trip-count
2397 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2398 // the vector loop induction variable.
2399 if (unsigned TC =
2400 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2401 uint64_t MaxVF = VF.getKnownMinValue();
2402 if (VF.isScalable()) {
2403 std::optional<unsigned> MaxVScale =
2404 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2405 if (!MaxVScale)
2406 return false;
2407 MaxVF *= *MaxVScale;
2408 }
2409
2410 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2411 }
2412
2413 return false;
2414}
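// Worked example (illustrative): with an i8 widest induction type the mask is
// 255. For a known maximum trip count of 250 and VF * UF = 8, the test
// (255 - 250) > 8 fails, so the function returns false and the runtime
// overflow check must be kept. With an i32 induction type and the same trip
// count, (4294967295 - 250) > 8 holds, so the overflow check is known false
// and can be dropped.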
2415
2416// Return whether we allow using masked interleave-groups (for dealing with
2417// strided loads/stores that reside in predicated blocks, or for dealing
2418// with gaps).
2419 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2420 // If an override option has been passed in for interleaved accesses, use it.
2421 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2422 return EnableMaskedInterleavedMemAccesses;
2423
2424 return TTI.enableMaskedInterleavedAccessVectorization();
2425 }
2426
2427// Try to vectorize the interleave group that \p Instr belongs to.
2428//
2429// E.g. Translate following interleaved load group (factor = 3):
2430// for (i = 0; i < N; i+=3) {
2431// R = Pic[i]; // Member of index 0
2432// G = Pic[i+1]; // Member of index 1
2433// B = Pic[i+2]; // Member of index 2
2434// ... // do something to R, G, B
2435// }
2436// To:
2437// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2438// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2439// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2440// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2441//
2442// Or translate following interleaved store group (factor = 3):
2443// for (i = 0; i < N; i+=3) {
2444// ... do something to R, G, B
2445// Pic[i] = R; // Member of index 0
2446// Pic[i+1] = G; // Member of index 1
2447// Pic[i+2] = B; // Member of index 2
2448// }
2449// To:
2450// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2451// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2452// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2453// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2454// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2457 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2458 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2459 Instruction *Instr = Group->getInsertPos();
2460 const DataLayout &DL = Instr->getModule()->getDataLayout();
2461
2462 // Prepare for the vector type of the interleaved load/store.
2463 Type *ScalarTy = getLoadStoreType(Instr);
2464 unsigned InterleaveFactor = Group->getFactor();
2465 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2466
2467 // Prepare for the new pointers.
2468 SmallVector<Value *, 2> AddrParts;
2469 unsigned Index = Group->getIndex(Instr);
2470
2471 // TODO: extend the masked interleaved-group support to reversed access.
2472 assert((!BlockInMask || !Group->isReverse()) &&
2473 "Reversed masked interleave-group not supported.");
2474
2475 Value *Idx;
2476 // If the group is reverse, adjust the index to refer to the last vector lane
2477 // instead of the first. We adjust the index from the first vector lane,
2478 // rather than directly getting the pointer for lane VF - 1, because the
2479 // pointer operand of the interleaved access is supposed to be uniform. For
2480 // uniform instructions, we're only required to generate a value for the
2481 // first vector lane in each unroll iteration.
2482 if (Group->isReverse()) {
2483 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2484 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2488 } else
2490
2491 for (unsigned Part = 0; Part < UF; Part++) {
2492 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2493 if (auto *I = dyn_cast<Instruction>(AddrPart))
2494 State.setDebugLocFrom(I->getDebugLoc());
2495
2496 // Note that the current instruction could be at any member index. We need to
2497 // adjust the address to the member of index 0.
2498 //
2499 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2500 // b = A[i]; // Member of index 0
2501 // Current pointer is pointed to A[i+1], adjust it to A[i].
2502 //
2503 // E.g. A[i+1] = a; // Member of index 1
2504 // A[i] = b; // Member of index 0
2505 // A[i+2] = c; // Member of index 2 (Current instruction)
2506 // Current pointer is pointed to A[i+2], adjust it to A[i].
2507
2508 bool InBounds = false;
2509 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2510 InBounds = gep->isInBounds();
2511 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2512 AddrParts.push_back(AddrPart);
2513 }
2514
2515 State.setDebugLocFrom(Instr->getDebugLoc());
2516 Value *PoisonVec = PoisonValue::get(VecTy);
2517
2518 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2519 unsigned Part, Value *MaskForGaps) -> Value * {
2520 if (VF.isScalable()) {
2521 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2522 assert(InterleaveFactor == 2 &&
2523 "Unsupported deinterleave factor for scalable vectors");
2524 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2525 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2526 auto *MaskTy =
2528 return Builder.CreateIntrinsic(
2529 MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2530 /*FMFSource=*/nullptr, "interleaved.mask");
2531 }
2532
2533 if (!BlockInMask)
2534 return MaskForGaps;
2535
2536 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2537 Value *ShuffledMask = Builder.CreateShuffleVector(
2538 BlockInMaskPart,
2539 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2540 "interleaved.mask");
2541 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2542 MaskForGaps)
2543 : ShuffledMask;
2544 };
2545
2546 // Vectorize the interleaved load group.
2547 if (isa<LoadInst>(Instr)) {
2548 Value *MaskForGaps = nullptr;
2549 if (NeedsMaskForGaps) {
2550 MaskForGaps =
2552 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2553 }
2554
2555 // For each unroll part, create a wide load for the group.
2556 SmallVector<Value *, 2> NewLoads;
2557 for (unsigned Part = 0; Part < UF; Part++) {
2558 Instruction *NewLoad;
2559 if (BlockInMask || MaskForGaps) {
2561 "masked interleaved groups are not allowed.");
2562 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2563 NewLoad =
2564 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2565 GroupMask, PoisonVec, "wide.masked.vec");
2566 }
2567 else
2568 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2569 Group->getAlign(), "wide.vec");
2570 Group->addMetadata(NewLoad);
2571 NewLoads.push_back(NewLoad);
2572 }
2573
2574 if (VecTy->isScalableTy()) {
2575 assert(InterleaveFactor == 2 &&
2576 "Unsupported deinterleave factor for scalable vectors");
2577
2578 for (unsigned Part = 0; Part < UF; ++Part) {
2579 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2580 // so must use intrinsics to deinterleave.
2582 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2583 /*FMFSource=*/nullptr, "strided.vec");
2584 unsigned J = 0;
2585 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2586 Instruction *Member = Group->getMember(I);
2587
2588 if (!Member)
2589 continue;
2590
2591 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2592 // If this member has different type, cast the result type.
2593 if (Member->getType() != ScalarTy) {
2594 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2595 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2596 }
2597
2598 if (Group->isReverse())
2599 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2600
2601 State.set(VPDefs[J], StridedVec, Part);
2602 ++J;
2603 }
2604 }
2605
2606 return;
2607 }
2608
2609 // For each member in the group, shuffle out the appropriate data from the
2610 // wide loads.
2611 unsigned J = 0;
2612 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2613 Instruction *Member = Group->getMember(I);
2614
2615 // Skip the gaps in the group.
2616 if (!Member)
2617 continue;
2618
2619 auto StrideMask =
2620 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2621 for (unsigned Part = 0; Part < UF; Part++) {
2622 Value *StridedVec = Builder.CreateShuffleVector(
2623 NewLoads[Part], StrideMask, "strided.vec");
2624
2625 // If this member has different type, cast the result type.
2626 if (Member->getType() != ScalarTy) {
2627 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2628 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2629 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2630 }
2631
2632 if (Group->isReverse())
2633 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2634
2635 State.set(VPDefs[J], StridedVec, Part);
2636 }
2637 ++J;
2638 }
2639 return;
2640 }
2641
2642 // The sub vector type for current instruction.
2643 auto *SubVT = VectorType::get(ScalarTy, VF);
2644
2645 // Vectorize the interleaved store group.
2646 Value *MaskForGaps =
2648 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2649 "masked interleaved groups are not allowed.");
2650 assert((!MaskForGaps || !VF.isScalable()) &&
2651 "masking gaps for scalable vectors is not yet supported.");
2652 for (unsigned Part = 0; Part < UF; Part++) {
2653 // Collect the stored vector from each member.
2654 SmallVector<Value *, 4> StoredVecs;
2655 unsigned StoredIdx = 0;
2656 for (unsigned i = 0; i < InterleaveFactor; i++) {
2657 assert((Group->getMember(i) || MaskForGaps) &&
2658 "Fail to get a member from an interleaved store group");
2659 Instruction *Member = Group->getMember(i);
2660
2661 // Skip the gaps in the group.
2662 if (!Member) {
2663 Value *Undef = PoisonValue::get(SubVT);
2664 StoredVecs.push_back(Undef);
2665 continue;
2666 }
2667
2668 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2669 ++StoredIdx;
2670
2671 if (Group->isReverse())
2672 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2673
2674 // If this member has different type, cast it to a unified type.
2675
2676 if (StoredVec->getType() != SubVT)
2677 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2678
2679 StoredVecs.push_back(StoredVec);
2680 }
2681
2682 // Interleave all the smaller vectors into one wider vector.
2683 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2684 Instruction *NewStoreInstr;
2685 if (BlockInMask || MaskForGaps) {
2686 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2687 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2688 Group->getAlign(), GroupMask);
2689 } else
2690 NewStoreInstr =
2691 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2692
2693 Group->addMetadata(NewStoreInstr);
2694 }
2695}
2696
2698 VPReplicateRecipe *RepRecipe,
2699 const VPIteration &Instance,
2700 VPTransformState &State) {
2701 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2702
2703 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2704 // the first lane and part.
2705 if (isa<NoAliasScopeDeclInst>(Instr))
2706 if (!Instance.isFirstIteration())
2707 return;
2708
2709 // Does this instruction return a value ?
2710 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2711
2712 Instruction *Cloned = Instr->clone();
2713 if (!IsVoidRetTy) {
2714 Cloned->setName(Instr->getName() + ".cloned");
2715#if !defined(NDEBUG)
2716 // Verify that VPlan type inference results agree with the type of the
2717 // generated values.
2718 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2719 "inferred type and type from generated instructions do not match");
2720#endif
2721 }
2722
2723 RepRecipe->setFlags(Cloned);
2724
2725 if (auto DL = Instr->getDebugLoc())
2726 State.setDebugLocFrom(DL);
2727
2728 // Replace the operands of the cloned instructions with their scalar
2729 // equivalents in the new loop.
2730 for (const auto &I : enumerate(RepRecipe->operands())) {
2731 auto InputInstance = Instance;
2732 VPValue *Operand = I.value();
2734 InputInstance.Lane = VPLane::getFirstLane();
2735 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2736 }
2737 State.addNewMetadata(Cloned, Instr);
2738
2739 // Place the cloned scalar in the new loop.
2740 State.Builder.Insert(Cloned);
2741
2742 State.set(RepRecipe, Cloned, Instance);
2743
2744 // If we just cloned a new assumption, add it the assumption cache.
2745 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2747
2748 // End if-block.
2749 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2750 if (IfPredicateInstr)
2751 PredicatedInstructions.push_back(Cloned);
2752}
2753
2754Value *
2756 if (VectorTripCount)
2757 return VectorTripCount;
2758
2759 Value *TC = getTripCount();
2760 IRBuilder<> Builder(InsertBlock->getTerminator());
2761
2762 Type *Ty = TC->getType();
2763 // This is where we can make the step a runtime constant.
2764 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2765
2766 // If the tail is to be folded by masking, round the number of iterations N
2767 // up to a multiple of Step instead of rounding down. This is done by first
2768 // adding Step-1 and then rounding down. Note that it's ok if this addition
2769 // overflows: the vector induction variable will eventually wrap to zero given
2770 // that it starts at zero and its Step is a power of two; the loop will then
2771 // exit, with the last early-exit vector comparison also producing all-true.
2772 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2773 // is accounted for in emitIterationCountCheck that adds an overflow check.
2774 if (Cost->foldTailByMasking()) {
2776 "VF*UF must be a power of 2 when folding tail by masking");
2777 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2778 TC = Builder.CreateAdd(
2779 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2780 }
2781
2782 // Now we need to generate the expression for the part of the loop that the
2783 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2784 // iterations are not required for correctness, or N - Step, otherwise. Step
2785 // is equal to the vectorization factor (number of SIMD elements) times the
2786 // unroll factor (number of SIMD instructions).
2787 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2788
2789 // There are cases where we *must* run at least one iteration in the remainder
2790 // loop. See the cost model for when this can happen. If the step evenly
2791 // divides the trip count, we set the remainder to be equal to the step. If
2792 // the step does not evenly divide the trip count, no adjustment is necessary
2793 // since there will already be scalar iterations. Note that the minimum
2794 // iterations check ensures that N >= Step.
2795 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2796 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2797 R = Builder.CreateSelect(IsZero, Step, R);
2798 }
2799
2800 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2801
2802 return VectorTripCount;
2803}
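// Worked example (illustrative): with trip count N = 17 and Step = VF * UF = 8,
// n.mod.vf = 17 % 8 = 1 and n.vec = 17 - 1 = 16, leaving one scalar remainder
// iteration. When folding the tail by masking, N is first rounded up to
// 17 + 7 = 24, so n.vec = 24 and the masked vector loop covers every
// iteration. When a scalar epilogue is required and N = 16 divides evenly,
// the remainder is forced to Step, giving n.vec = 8 and eight remaining
// scalar iterations.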
2804
2806 const DataLayout &DL) {
2807 // Verify that V is a vector type with same number of elements as DstVTy.
2808 auto *DstFVTy = cast<VectorType>(DstVTy);
2809 auto VF = DstFVTy->getElementCount();
2810 auto *SrcVecTy = cast<VectorType>(V->getType());
2811 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2812 Type *SrcElemTy = SrcVecTy->getElementType();
2813 Type *DstElemTy = DstFVTy->getElementType();
2814 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2815 "Vector elements must have same size");
2816
2817 // Do a direct cast if element types are castable.
2818 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2819 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2820 }
2821 // V cannot be directly casted to desired vector type.
2822 // May happen when V is a floating point vector but DstVTy is a vector of
2823 // pointers or vice-versa. Handle this using a two-step bitcast using an
2824 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2825 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2826 "Only one type should be a pointer type");
2827 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2828 "Only one type should be a floating point type");
2829 Type *IntTy =
2830 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2831 auto *VecIntTy = VectorType::get(IntTy, VF);
2832 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2833 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2834}
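// Example (illustrative, assuming 64-bit pointers): casting <2 x double> to
// <2 x ptr> is not a single bitcast, so the helper goes through the matching
// integer vector type:
//   %tmp = bitcast <2 x double> %v to <2 x i64>
//   %res = inttoptr <2 x i64> %tmp to <2 x ptr>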
2835
2837 Value *Count = getTripCount();
2838 // Reuse existing vector loop preheader for TC checks.
2839 // Note that new preheader block is generated for vector loop.
2840 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2841 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2842
2843 // Generate code to check if the loop's trip count is less than VF * UF, or
2844 // equal to it in case a scalar epilogue is required; this implies that the
2845 // vector trip count is zero. This check also covers the case where adding one
2846 // to the backedge-taken count overflowed leading to an incorrect trip count
2847 // of zero. In this case we will also jump to the scalar loop.
2848 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2849 : ICmpInst::ICMP_ULT;
2850
2851 // If tail is to be folded, vector loop takes care of all iterations.
2852 Type *CountTy = Count->getType();
2853 Value *CheckMinIters = Builder.getFalse();
2854 auto CreateStep = [&]() -> Value * {
2855 // Create step with max(MinProTripCount, UF * VF).
2857 return createStepForVF(Builder, CountTy, VF, UF);
2858
2859 Value *MinProfTC =
2861 if (!VF.isScalable())
2862 return MinProfTC;
2864 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2865 };
2866
2867 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2868 if (Style == TailFoldingStyle::None)
2869 CheckMinIters =
2870 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2871 else if (VF.isScalable() &&
2874 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2875 // an overflow to zero when updating induction variables and so an
2876 // additional overflow check is required before entering the vector loop.
2877
2878 // Get the maximum unsigned value for the type.
2879 Value *MaxUIntTripCount =
2880 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2881 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2882
2883 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2884 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2885 }
2886
2887 // Create new preheader for vector loop.
2889 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2890 "vector.ph");
2891
2892 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2893 DT->getNode(Bypass)->getIDom()) &&
2894 "TC check is expected to dominate Bypass");
2895
2896 // Update dominator for Bypass & LoopExit (if needed).
2897 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2898 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2899 // If there is an epilogue which must run, there's no edge from the
2900 // middle block to exit blocks and thus no need to update the immediate
2901 // dominator of the exit blocks.
2902 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2903
2904 BranchInst &BI =
2905 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2908 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2909 LoopBypassBlocks.push_back(TCCheckBlock);
2910}
2911
2913 BasicBlock *const SCEVCheckBlock =
2914 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2915 if (!SCEVCheckBlock)
2916 return nullptr;
2917
2918 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2920 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2921 "Cannot SCEV check stride or overflow when optimizing for size");
2922
2923
2924 // Update dominator only if this is first RT check.
2925 if (LoopBypassBlocks.empty()) {
2926 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2927 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2928 // If there is an epilogue which must run, there's no edge from the
2929 // middle block to exit blocks and thus no need to update the immediate
2930 // dominator of the exit blocks.
2931 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2932 }
2933
2934 LoopBypassBlocks.push_back(SCEVCheckBlock);
2935 AddedSafetyChecks = true;
2936 return SCEVCheckBlock;
2937}
2938
2940 // VPlan-native path does not do any analysis for runtime checks currently.
2942 return nullptr;
2943
2944 BasicBlock *const MemCheckBlock =
2945 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2946
2947 // Check if we generated code that checks at runtime whether arrays overlap.
2948 // We put the checks into a separate block to make the more common case of
2949 // few elements faster.
2950 if (!MemCheckBlock)
2951 return nullptr;
2952
2953 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2954 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2955 "Cannot emit memory checks when optimizing for size, unless forced "
2956 "to vectorize.");
2957 ORE->emit([&]() {
2958 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2961 << "Code-size may be reduced by not forcing "
2962 "vectorization, or by source-code modifications "
2963 "eliminating the need for runtime checks "
2964 "(e.g., adding 'restrict').";
2965 });
2966 }
2967
2968 LoopBypassBlocks.push_back(MemCheckBlock);
2969
2970 AddedSafetyChecks = true;
2971
2972 return MemCheckBlock;
2973}
2974
2978 assert(LoopVectorPreHeader && "Invalid loop structure");
2979 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2980 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2981 "multiple exit loop without required epilogue?");
2982
2985 LI, nullptr, Twine(Prefix) + "middle.block");
2988 nullptr, Twine(Prefix) + "scalar.ph");
2989
2990 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2991
2992 // Set up the middle block terminator. Two cases:
2993 // 1) If we know that we must execute the scalar epilogue, emit an
2994 // unconditional branch.
2995 // 2) Otherwise, we must have a single unique exit block (due to how we
2996 // implement the multiple exit case). In this case, set up a conditional
2997 // branch from the middle block to the loop scalar preheader, and the
2998 // exit block. completeLoopSkeleton will update the condition to use an
2999 // iteration check, if required to decide whether to execute the remainder.
3000 BranchInst *BrInst =
3001 Cost->requiresScalarEpilogue(VF.isVector())
3004 Builder.getTrue());
3005 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3007
3008 // Update dominator for loop exit. During skeleton creation, only the vector
3009 // pre-header and the middle block are created. The vector loop is entirely
3010 // created during VPlan execution.
3011 if (!Cost->requiresScalarEpilogue(VF.isVector()))
3012 // If there is an epilogue which must run, there's no edge from the
3013 // middle block to exit blocks and thus no need to update the immediate
3014 // dominator of the exit blocks.
3015 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3016}
3017
3019 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3020 ArrayRef<BasicBlock *> BypassBlocks,
3021 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3023 assert(VectorTripCount && "Expected valid arguments");
3024
3025 Instruction *OldInduction = Legal->getPrimaryInduction();
3026 Value *&EndValue = IVEndValues[OrigPhi];
3027 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3028 if (OrigPhi == OldInduction) {
3029 // We know what the end value is.
3030 EndValue = VectorTripCount;
3031 } else {
3033
3034 // Fast-math-flags propagate from the original induction instruction.
3035 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3036 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3037
3039 Step, II.getKind(), II.getInductionBinOp());
3040 EndValue->setName("ind.end");
3041
3042 // Compute the end value for the additional bypass (if applicable).
3043 if (AdditionalBypass.first) {
3044 B.SetInsertPoint(AdditionalBypass.first,
3045 AdditionalBypass.first->getFirstInsertionPt());
3046 EndValueFromAdditionalBypass =
3047 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3048 Step, II.getKind(), II.getInductionBinOp());
3049 EndValueFromAdditionalBypass->setName("ind.end");
3050 }
3051 }
3052
3053 // Create phi nodes to merge from the backedge-taken check block.
3054 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3056 // Copy original phi DL over to the new one.
3057 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3058
3059 // The new PHI merges the original incoming value, in case of a bypass,
3060 // or the value at the end of the vectorized loop.
3061 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3062
3063 // Fix the scalar body counter (PHI node).
3064 // The old induction's phi node in the scalar body needs the truncated
3065 // value.
3066 for (BasicBlock *BB : BypassBlocks)
3067 BCResumeVal->addIncoming(II.getStartValue(), BB);
3068
3069 if (AdditionalBypass.first)
3070 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3071 EndValueFromAdditionalBypass);
3072 return BCResumeVal;
3073}
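// Illustrative shape of the resume phi produced above (a sketch of the
// resulting IR, not copied from a test):
//   scalar.ph:
//     %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
//                              [ %start, %vector.memcheck ],
//                              [ %start, %vector.scevcheck ]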
3074
3075/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3076/// expansion results.
3078 const SCEV2ValueTy &ExpandedSCEVs) {
3079 const SCEV *Step = ID.getStep();
3080 if (auto *C = dyn_cast<SCEVConstant>(Step))
3081 return C->getValue();
3082 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3083 return U->getValue();
3084 auto I = ExpandedSCEVs.find(Step);
3085 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3086 return I->second;
3087}
3088
3090 const SCEV2ValueTy &ExpandedSCEVs,
3091 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3092 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3093 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3094 "Inconsistent information about additional bypass.");
3095 // We are going to resume the execution of the scalar loop.
3096 // Go over all of the induction variables that we found and fix the
3097 // PHIs that are left in the scalar version of the loop.
3098 // The starting values of PHI nodes depend on the counter of the last
3099 // iteration in the vectorized loop.
3100 // If we come from a bypass edge then we need to start from the original
3101 // start value.
3102 for (const auto &InductionEntry : Legal->getInductionVars()) {
3103 PHINode *OrigPhi = InductionEntry.first;
3104 const InductionDescriptor &II = InductionEntry.second;
3105 PHINode *BCResumeVal = createInductionResumeValue(
3106 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3107 AdditionalBypass);
3108 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3109 }
3110}
3111
3113 // The trip counts should be cached by now.
3114 Value *Count = getTripCount();
3116
3117 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3118
3119 // Add a check in the middle block to see if we have completed
3120 // all of the iterations in the first vector loop. Three cases:
3121 // 1) If we require a scalar epilogue, there is no conditional branch as
3122 // we unconditionally branch to the scalar preheader. Do nothing.
3123 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3124 // Thus if tail is to be folded, we know we don't need to run the
3125 // remainder and we can use the previous value for the condition (true).
3126 // 3) Otherwise, construct a runtime check.
3127 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3128 !Cost->foldTailByMasking()) {
3129 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3130 // of the corresponding compare because they may have ended up with
3131 // different line numbers and we want to avoid awkward line stepping while
3132 // debugging. E.g., if the compare has a line number inside the loop.
3133 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3134 // operands. Perform simplification directly on VPlan once the branch is
3135 // modeled there.
3137 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3138 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3139 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3140 BI.setCondition(CmpN);
3141 if (hasBranchWeightMD(*ScalarLatchTerm)) {
3142 // Assume that `Count % VectorTripCount` is equally distributed.
3143 unsigned TripCount = UF * VF.getKnownMinValue();
3144 assert(TripCount > 0 && "trip count should not be zero");
3145 const uint32_t Weights[] = {1, TripCount - 1};
3146 setBranchWeights(BI, Weights);
3147 }
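 // For example (illustrative values), with VF = 4 and UF = 2 the estimate is
 // TripCount = 8 and the weights become {1, 7}: under the equal-distribution
 // assumption the compare Count == VectorTripCount is expected to hold
 // roughly once in every 8 executions.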
3148 }
3149
3150#ifdef EXPENSIVE_CHECKS
3151 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3152#endif
3153
3154 return LoopVectorPreHeader;
3155}
3156
3157std::pair<BasicBlock *, Value *>
3159 const SCEV2ValueTy &ExpandedSCEVs) {
3160 /*
3161 In this function we generate a new loop. The new loop will contain
3162 the vectorized instructions while the old loop will continue to run the
3163 scalar remainder.
3164
3165 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3166 / | preheader are expanded here. Eventually all required SCEV
3167 / | expansion should happen here.
3168 / v
3169 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3170 | / |
3171 | / v
3172 || [ ] <-- vector pre header.
3173 |/ |
3174 | v
3175 | [ ] \
3176 | [ ]_| <-- vector loop (created during VPlan execution).
3177 | |
3178 | v
3179 \ -[ ] <--- middle-block.
3180 \/ |
3181 /\ v
3182 | ->[ ] <--- new preheader.
3183 | |
3184 (opt) v <-- edge from middle to exit iff epilogue is not required.
3185 | [ ] \
3186 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3187 \ |
3188 \ v
3189 >[ ] <-- exit block(s).
3190 ...
3191 */
3192
3193 // Create an empty vector loop, and prepare basic blocks for the runtime
3194 // checks.
3196
3197 // Now, compare the new count to zero. If it is zero skip the vector loop and
3198 // jump to the scalar loop. This check also covers the case where the
3199 // backedge-taken count is uint##_max: adding one to it will overflow leading
3200 // to an incorrect trip count of zero. In this (rare) case we will also jump
3201 // to the scalar loop.
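 // For example (shorthand), with an i8 count a backedge-taken count of 255
 // gives a trip count of 255 + 1 == 0 after wrapping, so the zero check
 // correctly routes execution to the scalar loop.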
3203
3204 // Generate the code to check any assumptions that we've made for SCEV
3205 // expressions.
3207
3208 // Generate the code that checks in runtime if arrays overlap. We put the
3209 // checks into a separate block to make the more common case of few elements
3210 // faster.
3212
3213 // Emit phis for the new starting index of the scalar loop.
3214 createInductionResumeValues(ExpandedSCEVs);
3215
3216 return {completeLoopSkeleton(), nullptr};
3217}
3218
3219// Fix up external users of the induction variable. At this point, we are
3220// in LCSSA form, with all external PHIs that use the IV having one input value,
3221// coming from the remainder loop. We need those PHIs to also have a correct
3222// value for the IV when arriving directly from the middle block.
3224 const InductionDescriptor &II,
3225 Value *VectorTripCount, Value *EndValue,
3226 BasicBlock *MiddleBlock,
3227 BasicBlock *VectorHeader, VPlan &Plan,
3228 VPTransformState &State) {
3229 // There are two kinds of external IV usages - those that use the value
3230 // computed in the last iteration (the PHI) and those that use the penultimate
3231 // value (the value that feeds into the phi from the loop latch).
3232 // We allow both, but they, obviously, have different values.
3233
3234 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3235
3236 DenseMap<Value *, Value *> MissingVals;
3237
3238 // An external user of the last iteration's value should see the value that
3239 // the remainder loop uses to initialize its own IV.
3241 for (User *U : PostInc->users()) {
3242 Instruction *UI = cast<Instruction>(U);
3243 if (!OrigLoop->contains(UI)) {
3244 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3245 MissingVals[UI] = EndValue;
3246 }
3247 }
3248
3249 // An external user of the penultimate value needs to see EndValue - Step.
3250 // The simplest way to get this is to recompute it from the constituent SCEVs,
3251 // that is Start + (Step * (CRD - 1)).
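 // For example (illustrative values), for an induction starting at 0 with a
 // step of 2 and a vector trip count of 8, the penultimate value is
 // 0 + 2 * (8 - 1) == 14, computed below by applying emitTransformedIndex to
 // CountMinusOne.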
3252 for (User *U : OrigPhi->users()) {
3253 auto *UI = cast<Instruction>(U);
3254 if (!OrigLoop->contains(UI)) {
3255 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3256 IRBuilder<> B(MiddleBlock->getTerminator());
3257
3258 // Fast-math-flags propagate from the original induction instruction.
3259 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3260 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3261
3262 Value *CountMinusOne = B.CreateSub(
3263 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3264 CountMinusOne->setName("cmo");
3265
3266 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3267 assert(StepVPV && "step must have been expanded during VPlan execution");
3268 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3269 : State.get(StepVPV, {0, 0});
3270 Value *Escape =
3271 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3272 II.getKind(), II.getInductionBinOp());
3273 Escape->setName("ind.escape");
3274 MissingVals[UI] = Escape;
3275 }
3276 }
3277
3278 for (auto &I : MissingVals) {
3279 PHINode *PHI = cast<PHINode>(I.first);
3280 // One corner case we have to handle is two IVs "chasing" each other,
3281 // that is %IV2 = phi [...], [ %IV1, %latch ]
3282 // In this case, if IV1 has an external use, we need to avoid adding both
3283 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3284 // don't already have an incoming value for the middle block.
3285 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3286 PHI->addIncoming(I.second, MiddleBlock);
3287 Plan.removeLiveOut(PHI);
3288 }
3289 }
3290}
3291
3292namespace {
3293
3294struct CSEDenseMapInfo {
3295 static bool canHandle(const Instruction *I) {
3296 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3297 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3298 }
3299
3300 static inline Instruction *getEmptyKey() {
3302 }
3303
3304 static inline Instruction *getTombstoneKey() {
3306 }
3307
3308 static unsigned getHashValue(const Instruction *I) {
3309 assert(canHandle(I) && "Unknown instruction!");
3310 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3311 I->value_op_end()));
3312 }
3313
3314 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3315 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3316 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3317 return LHS == RHS;
3318 return LHS->isIdenticalTo(RHS);
3319 }
3320};
3321
3322} // end anonymous namespace
3323
3324 /// Perform CSE of induction variable instructions.
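/// For example (shorthand; names are illustrative), two identical GEPs
/// produced for different unroll parts,
///   %g1 = getelementptr inbounds i32, ptr %a, i64 %idx
///   %g2 = getelementptr inbounds i32, ptr %a, i64 %idx
/// are merged: uses of %g2 are replaced by %g1 and %g2 is erased.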
3325static void cse(BasicBlock *BB) {
3326 // Perform simple cse.
3328 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3329 if (!CSEDenseMapInfo::canHandle(&In))
3330 continue;
3331
3332 // Check if we can replace this instruction with any of the
3333 // visited instructions.
3334 if (Instruction *V = CSEMap.lookup(&In)) {
3335 In.replaceAllUsesWith(V);
3336 In.eraseFromParent();
3337 continue;
3338 }
3339
3340 CSEMap[&In] = &In;
3341 }
3342}
3343
3346 ElementCount VF) const {
3347 // We only need to calculate a cost if the VF is scalar; for actual vectors
3348 // we should already have a pre-calculated cost at each VF.
3349 if (!VF.isScalar())
3350 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3351
3353 Type *RetTy = CI->getType();
3355 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3356 return *RedCost;
3357
3359 for (auto &ArgOp : CI->args())
3360 Tys.push_back(ArgOp->getType());
3361
3362 InstructionCost ScalarCallCost =
3364
3365 // If this is an intrinsic we may have a lower cost for it.
3367 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3368 return std::min(ScalarCallCost, IntrinsicCost);
3369 }
3370 return ScalarCallCost;
3371}
3372
3374 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3375 return Elt;
3376 return VectorType::get(Elt, VF);
3377}
3378
3381 ElementCount VF) const {
3383 assert(ID && "Expected intrinsic call!");
3384 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3385 FastMathFlags FMF;
3386 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3387 FMF = FPMO->getFastMathFlags();
3388
3391 SmallVector<Type *> ParamTys;
3392 std::transform(FTy->param_begin(), FTy->param_end(),
3393 std::back_inserter(ParamTys),
3394 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3395
3396 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3397 dyn_cast<IntrinsicInst>(CI));
3398 return TTI.getIntrinsicInstrCost(CostAttrs,
3400}
3401
3403 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3404 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3405 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3406}
3407
3409 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3410 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3411 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3412}
3413
3415 VPlan &Plan) {
3416 // Fix widened non-induction PHIs by setting up the PHI operands.
3418 fixNonInductionPHIs(Plan, State);
3419
3420 // At this point every instruction in the original loop is widened to a
3421 // vector form. Now we need to fix the recurrences in the loop. These PHI
3422 // nodes are currently empty because we did not want to introduce cycles.
3423 // This is the second stage of vectorizing recurrences. Note that fixing
3424 // reduction phis are already modeled in VPlan.
3425 // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3426 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3427 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3428 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3429 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3430 fixFixedOrderRecurrence(FOR, State);
3431 }
3432
3433 // Forget the original basic block.
3436
3437 // After vectorization, the exit blocks of the original loop will have
3438 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3439 // looked through single-entry phis.
3440 SmallVector<BasicBlock *> ExitBlocks;
3441 OrigLoop->getExitBlocks(ExitBlocks);
3442 for (BasicBlock *Exit : ExitBlocks)
3443 for (PHINode &PN : Exit->phis())
3445
3446 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3447 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3448 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3449 // No edge from the middle block to the unique exit block has been inserted
3450 // and there is nothing to fix from vector loop; phis should have incoming
3451 // from scalar loop only.
3452 } else {
3453 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3454 // the cost model.
3455
3456 // If we inserted an edge from the middle block to the unique exit block,
3457 // update uses outside the loop (phis) to account for the newly inserted
3458 // edge.
3459
3460 // Fix-up external users of the induction variables.
3461 for (const auto &Entry : Legal->getInductionVars())
3462 fixupIVUsers(Entry.first, Entry.second,
3464 IVEndValues[Entry.first], LoopMiddleBlock,
3465 VectorLoop->getHeader(), Plan, State);
3466 }
3467
3468 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3469 // in the exit block, so update the builder.
3470 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3471 State.CFG.ExitBB->getFirstNonPHIIt());
3472 for (const auto &KV : Plan.getLiveOuts())
3473 KV.second->fixPhi(Plan, State);
3474
3476 sinkScalarOperands(&*PI);
3477
3478 // Remove redundant induction instructions.
3479 cse(VectorLoop->getHeader());
3480
3481 // Set/update profile weights for the vector and remainder loops as original
3482 // loop iterations are now distributed among them. Note that original loop
3483 // represented by LoopScalarBody becomes remainder loop after vectorization.
3484 //
3485 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3486 // end up with a slightly less accurate result, but that should be OK since
3487 // the profile is not inherently precise anyway. Note also that a possible
3488 // bypass of the vector code caused by legality checks is ignored,
3489 // optimistically assigning all the weight to the vector loop.
3490 //
3491 // For scalable vectorization we can't know at compile time how many
3492 // iterations of the loop are handled in one vector iteration, so instead
3493 // assume a pessimistic vscale of '1'.
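 // For example (illustrative numbers), with an original trip count estimate
 // of 1000, VF = 4 and UF = 2, roughly 1000 / 8 = 125 iterations are
 // attributed to the vector loop, with the remaining few iterations
 // attributed to the scalar remainder loop.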
3496 VF.getKnownMinValue() * UF);
3497}
3498
3501 // This is the second phase of vectorizing first-order recurrences. An
3502 // overview of the transformation is described below. Suppose we have the
3503 // following loop.
3504 //
3505 // for (int i = 0; i < n; ++i)
3506 // b[i] = a[i] - a[i - 1];
3507 //
3508 // There is a first-order recurrence on "a". For this loop, the shorthand
3509 // scalar IR looks like:
3510 //
3511 // scalar.ph:
3512 // s_init = a[-1]
3513 // br scalar.body
3514 //
3515 // scalar.body:
3516 // i = phi [0, scalar.ph], [i+1, scalar.body]
3517 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3518 // s2 = a[i]
3519 // b[i] = s2 - s1
3520 // br cond, scalar.body, ...
3521 //
3522 // In this example, s1 is a recurrence because its value depends on the
3523 // previous iteration. In the first phase of vectorization, we created a
3524 // vector phi v1 for s1. We now complete the vectorization and produce the
3525 // shorthand vector IR shown below (for VF = 4, UF = 1).
3526 //
3527 // vector.ph:
3528 // v_init = vector(..., ..., ..., a[-1])
3529 // br vector.body
3530 //
3531 // vector.body
3532 // i = phi [0, vector.ph], [i+4, vector.body]
3533 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3534 // v2 = a[i, i+1, i+2, i+3];
3535 // v3 = vector(v1(3), v2(0, 1, 2))
3536 // b[i, i+1, i+2, i+3] = v2 - v3
3537 // br cond, vector.body, middle.block
3538 //
3539 // middle.block:
3540 // x = v2(3)
3541 // br scalar.ph
3542 //
3543 // scalar.ph:
3544 // s_init = phi [x, middle.block], [a[-1], otherwise]
3545 // br scalar.body
3546 //
3547 // After the vector loop completes execution, we extract the next value of
3548 // the recurrence (x) to use as the initial value in the scalar loop.
3549
3550 // Extract the last vector element in the middle block. This will be the
3551 // initial value for the recurrence when jumping to the scalar loop.
3552 VPValue *PreviousDef = PhiR->getBackedgeValue();
3553 Value *Incoming = State.get(PreviousDef, UF - 1);
3554 auto *ExtractForScalar = Incoming;
3555 auto *IdxTy = Builder.getInt32Ty();
3556 Value *RuntimeVF = nullptr;
3557 if (VF.isVector()) {
3558 auto *One = ConstantInt::get(IdxTy, 1);
3560 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3561 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3562 ExtractForScalar =
3563 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3564 }
3565
3566 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3567 assert(PhiR->getNumUsers() == 1 &&
3568 RecurSplice->getOpcode() ==
3570 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3571 SmallVector<VPLiveOut *> LiveOuts;
3572 for (VPUser *U : RecurSplice->users())
3573 if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3574 LiveOuts.push_back(LiveOut);
3575
3576 if (!LiveOuts.empty()) {
3577 // Extract the second last element in the middle block if the
3578 // Phi is used outside the loop. We need to extract the phi itself
3579 // and not the last element (the phi update in the current iteration). This
3580 // will be the value when jumping to the exit block from the
3581 // LoopMiddleBlock, when the scalar loop is not run at all.
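 // For example, with VF = 4 this extracts lane RuntimeVF - 2 == 2, i.e. v2(2)
 // in the shorthand IR above, whereas the scalar resume value uses v2(3).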
3582 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3583 if (VF.isVector()) {
3584 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3585 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3586 Incoming, Idx, "vector.recur.extract.for.phi");
3587 } else {
3588 assert(UF > 1 && "VF and UF cannot both be 1");
3589 // When the loop is unrolled without vectorizing, initialize
3590 // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to
3591 // `Incoming`, i.e. part UF - 2. This is analogous to the vectorized case
3592 // above: extracting the second-to-last element when VF > 1.
3593 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3594 }
3595
3596 for (VPLiveOut *LiveOut : LiveOuts) {
3597 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3598 PHINode *LCSSAPhi = LiveOut->getPhi();
3599 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3600 State.Plan->removeLiveOut(LCSSAPhi);
3601 }
3602 }
3603
3604 // Fix the initial value of the original recurrence in the scalar loop.
3606 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3607 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3608 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3609 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3610 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3611 Start->addIncoming(Incoming, BB);
3612 }
3613
3614 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3615 Phi->setName("scalar.recur");
3616}
3617
3619 // The basic block and loop containing the predicated instruction.
3620 auto *PredBB = PredInst->getParent();
3621 auto *VectorLoop = LI->getLoopFor(PredBB);
3622
3623 // Initialize a worklist with the operands of the predicated instruction.
3624 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3625
3626 // Holds instructions that we need to analyze again. An instruction may be
3627 // reanalyzed if we don't yet know if we can sink it or not.
3628 SmallVector<Instruction *, 8> InstsToReanalyze;
3629
3630 // Returns true if a given use occurs in the predicated block. Phi nodes use
3631 // their operands in their corresponding predecessor blocks.
3632 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3633 auto *I = cast<Instruction>(U.getUser());
3634 BasicBlock *BB = I->getParent();
3635 if (auto *Phi = dyn_cast<PHINode>(I))
3636 BB = Phi->getIncomingBlock(
3637 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3638 return BB == PredBB;
3639 };
3640
3641 // Iteratively sink the scalarized operands of the predicated instruction
3642 // into the block we created for it. When an instruction is sunk, its
3643 // operands are then added to the worklist. The algorithm ends when a full
3644 // pass through the worklist sinks no instructions.
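 // For example (shorthand; names are illustrative), a scalarized address
 // computation whose only use is a predicated store,
 //   %gep = getelementptr inbounds i32, ptr %a, i64 %idx
 //   ...
 // pred.store.if:
 //   store i32 %v, ptr %gep
 // can be moved into pred.store.if, after which the instruction defining
 // %idx may become sinkable as well.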
3645 bool Changed;
3646 do {
3647 // Add the instructions that need to be reanalyzed to the worklist, and
3648 // reset the changed indicator.
3649 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3650 InstsToReanalyze.clear();
3651 Changed = false;
3652
3653 while (!Worklist.empty()) {
3654 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3655
3656 // We can't sink an instruction if it is a phi node, is not in the loop,
3657 // may have side effects or may read from memory.
3658 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3659 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3660 I->mayHaveSideEffects() || I->mayReadFromMemory())
3661 continue;
3662
3663 // If the instruction is already in PredBB, check if we can sink its
3664 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3665 // sinking the scalar instruction I, hence it appears in PredBB; but it
3666 // may have failed to sink I's operands (recursively), which we try
3667 // (again) here.
3668 if (I->getParent() == PredBB) {
3669 Worklist.insert(I->op_begin(), I->op_end());
3670 continue;
3671 }
3672
3673 // It's legal to sink the instruction if all its uses occur in the
3674 // predicated block. Otherwise, there's nothing to do yet, and we may
3675 // need to reanalyze the instruction.
3676 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3677 InstsToReanalyze.push_back(I);
3678 continue;
3679 }
3680
3681 // Move the instruction to the beginning of the predicated block, and add
3682 // its operands to the worklist.
3683 I->moveBefore(&*PredBB->getFirstInsertionPt());
3684 Worklist.insert(I->op_begin(), I->op_end());
3685
3686 // The sinking may have enabled other instructions to be sunk, so we will
3687 // need to iterate.
3688 Changed = true;
3689 }
3690 } while (Changed);
3691}
3692
3694 VPTransformState &State) {
3695 auto Iter = vp_depth_first_deep(Plan.getEntry());
3696 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3697 for (VPRecipeBase &P : VPBB->phis()) {
3698 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3699 if (!VPPhi)
3700 continue;
3701 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3702 // Make sure the builder has a valid insert point.
3703 Builder.SetInsertPoint(NewPhi);
3704 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3705 VPValue *Inc = VPPhi->getIncomingValue(i);
3706 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3707 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3708 }
3709 }
3710 }
3711}
3712
3713void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3714 // We should not collect Scalars more than once per VF. Right now, this
3715 // function is called from collectUniformsAndScalars(), which already does
3716 // this check. Collecting Scalars for VF=1 does not make any sense.
3717 assert(VF.isVector() && !Scalars.contains(VF) &&
3718 "This function should not be visited twice for the same VF");
3719
3720 // This avoids any chances of creating a REPLICATE recipe during planning
3721 // since that would result in generation of scalarized code during execution,
3722 // which is not supported for scalable vectors.
3723 if (VF.isScalable()) {
3724 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3725 return;
3726 }
3727
3729
3730 // These sets are used to seed the analysis with pointers used by memory
3731 // accesses that will remain scalar.
3733 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3734 auto *Latch = TheLoop->getLoopLatch();
3735
3736 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3737 // The pointer operands of loads and stores will be scalar as long as the
3738 // memory access is not a gather or scatter operation. The value operand of a
3739 // store will remain scalar if the store is scalarized.
3740 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3741 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3742 assert(WideningDecision != CM_Unknown &&
3743 "Widening decision should be ready at this moment");
3744 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3745 if (Ptr == Store->getValueOperand())
3746 return WideningDecision == CM_Scalarize;
3747 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3748 "Ptr is neither a value or pointer operand");
3749 return WideningDecision != CM_GatherScatter;
3750 };
3751
3752 // A helper that returns true if the given value is a bitcast or
3753 // getelementptr instruction contained in the loop.
3754 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3755 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3756 isa<GetElementPtrInst>(V)) &&
3758 };
3759
3760 // A helper that evaluates a memory access's use of a pointer. If the use will
3761 // be a scalar use and the pointer is only used by memory accesses, we place
3762 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3763 // PossibleNonScalarPtrs.
3764 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3765 // We only care about bitcast and getelementptr instructions contained in
3766 // the loop.
3767 if (!isLoopVaryingBitCastOrGEP(Ptr))
3768 return;
3769
3770 // If the pointer has already been identified as scalar (e.g., if it was
3771 // also identified as uniform), there's nothing to do.
3772 auto *I = cast<Instruction>(Ptr);
3773 if (Worklist.count(I))
3774 return;
3775
3776 // If the use of the pointer will be a scalar use, and all users of the
3777 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3778 // place the pointer in PossibleNonScalarPtrs.
3779 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3780 return isa<LoadInst>(U) || isa<StoreInst>(U);
3781 }))
3782 ScalarPtrs.insert(I);
3783 else
3784 PossibleNonScalarPtrs.insert(I);
3785 };
3786
3787 // We seed the scalars analysis with two classes of instructions: (1)
3788 // instructions marked uniform-after-vectorization and (2) bitcast,
3789 // getelementptr and (pointer) phi instructions used by memory accesses
3790 // requiring a scalar use.
3791 //
3792 // (1) Add to the worklist all instructions that have been identified as
3793 // uniform-after-vectorization.
3794 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3795
3796 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3797 // memory accesses requiring a scalar use. The pointer operands of loads and
3798 // stores will be scalar as long as the memory access is not a gather or
3799 // scatter operation. The value operand of a store will remain scalar if the
3800 // store is scalarized.
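 // For example (shorthand), for a consecutive (CM_Widen) access
 //   %gep = getelementptr inbounds i32, ptr %a, i64 %index
 //   %val = load i32, ptr %gep
 // only the lane-0 address is needed to form the wide load, so %gep is a
 // candidate for ScalarPtrs; a CM_GatherScatter access would instead need a
 // vector of pointers.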
3801 for (auto *BB : TheLoop->blocks())
3802 for (auto &I : *BB) {
3803 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3804 evaluatePtrUse(Load, Load->getPointerOperand());
3805 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3806 evaluatePtrUse(Store, Store->getPointerOperand());
3807 evaluatePtrUse(Store, Store->getValueOperand());
3808 }
3809 }
3810 for (auto *I : ScalarPtrs)
3811 if (!PossibleNonScalarPtrs.count(I)) {
3812 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3813 Worklist.insert(I);
3814 }
3815
3816 // Insert the forced scalars.
3817 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3818 // induction variable when the PHI user is scalarized.
3819 auto ForcedScalar = ForcedScalars.find(VF);
3820 if (ForcedScalar != ForcedScalars.end())
3821 for (auto *I : ForcedScalar->second) {
3822 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3823 Worklist.insert(I);
3824 }
3825
3826 // Expand the worklist by looking through any bitcasts and getelementptr
3827 // instructions we've already identified as scalar. This is similar to the
3828 // expansion step in collectLoopUniforms(); however, here we're only
3829 // expanding to include additional bitcasts and getelementptr instructions.
3830 unsigned Idx = 0;
3831 while (Idx != Worklist.size()) {
3832 Instruction *Dst = Worklist[Idx++];
3833 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3834 continue;
3835 auto *Src = cast<Instruction>(Dst->getOperand(0));
3836 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3837 auto *J = cast<Instruction>(U);
3838 return !TheLoop->contains(J) || Worklist.count(J) ||
3839 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3840 isScalarUse(J, Src));
3841 })) {
3842 Worklist.insert(Src);
3843 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3844 }
3845 }
3846
3847 // An induction variable will remain scalar if all users of the induction
3848 // variable and induction variable update remain scalar.
3849 for (const auto &Induction : Legal->getInductionVars()) {
3850 auto *Ind = Induction.first;
3851 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3852
3853 // If tail-folding is applied, the primary induction variable will be used
3854 // to feed a vector compare.
3855 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3856 continue;
3857
3858 // Returns true if \p Indvar is a pointer induction that is used directly by
3859 // load/store instruction \p I.
3860 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3861 Instruction *I) {
3862 return Induction.second.getKind() ==
3864 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3865 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3866 };
3867
3868 // Determine if all users of the induction variable are scalar after
3869 // vectorization.
3870 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3871 auto *I = cast<Instruction>(U);
3872 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3873 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3874 });
3875 if (!ScalarInd)
3876 continue;
3877
3878 // Determine if all users of the induction variable update instruction are
3879 // scalar after vectorization.
3880 auto ScalarIndUpdate =
3881 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3882 auto *I = cast<Instruction>(U);
3883 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3884 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3885 });
3886 if (!ScalarIndUpdate)
3887 continue;
3888
3889 // The induction variable and its update instruction will remain scalar.
3890 Worklist.insert(Ind);
3891 Worklist.insert(IndUpdate);
3892 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3893 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3894 << "\n");
3895 }
3896
3897 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3898}
3899
3901 Instruction *I, ElementCount VF) const {
3902 if (!isPredicatedInst(I))
3903 return false;
3904
3905 // Do we have a non-scalar lowering for this predicated
3906 // instruction? No - it is scalar with predication.
3907 switch(I->getOpcode()) {
3908 default:
3909 return true;
3910 case Instruction::Call:
3911 if (VF.isScalar())
3912 return true;
3913 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3914 .Kind == CM_Scalarize;
3915 case Instruction::Load:
3916 case Instruction::Store: {
3918 auto *Ty = getLoadStoreType(I);
3919 Type *VTy = Ty;
3920 if (VF.isVector())
3921 VTy = VectorType::get(Ty, VF);
3922 const Align Alignment = getLoadStoreAlignment(I);
3923 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3924 TTI.isLegalMaskedGather(VTy, Alignment))
3925 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3926 TTI.isLegalMaskedScatter(VTy, Alignment));
3927 }
3928 case Instruction::UDiv:
3929 case Instruction::SDiv:
3930 case Instruction::SRem:
3931 case Instruction::URem: {
3932 // We have the option to use the safe-divisor idiom to avoid predication.
3933 // The cost based decision here will always select safe-divisor for
3934 // scalable vectors as scalarization isn't legal.
3935 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3936 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3937 }
3938 }
3939}
3940
3942 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3943 return false;
3944
3945 // Can we prove this instruction is safe to unconditionally execute?
3946 // If not, we must use some form of predication.
3947 switch(I->getOpcode()) {
3948 default:
3949 return false;
3950 case Instruction::Load:
3951 case Instruction::Store: {
3952 if (!Legal->isMaskRequired(I))
3953 return false;
3954 // When we know the load's address is loop invariant and the instruction
3955 // in the original scalar loop was unconditionally executed, then we
3956 // don't need to mark it as a predicated instruction. Tail folding may
3957 // introduce additional predication, but we're guaranteed to always have
3958 // at least one active lane. We call Legal->blockNeedsPredication here
3959 // because it doesn't query tail-folding. For stores, we need to prove
3960 // both speculation safety (which follows from the same argument as
3961 // for loads) and that the value being stored is correct. The easiest
3962 // form of the latter is to require that all values stored are the same.
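 // For example, an unconditional load of a loop-invariant address
 //   %x = load i32, ptr %p
 // needs no mask under tail folding, and the analogous store is fine only if
 // the stored value is also loop invariant.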
3964 (isa<LoadInst>(I) ||
3965 (isa<StoreInst>(I) &&
3966 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3967 !Legal->blockNeedsPredication(I->getParent()))
3968 return false;
3969 return true;
3970 }
3971 case Instruction::UDiv:
3972 case Instruction::SDiv:
3973 case Instruction::SRem:
3974 case Instruction::URem:
3975 // TODO: We can use the loop preheader as the context point here and get
3976 // context-sensitive reasoning.
3978 case Instruction::Call:
3979 return Legal->isMaskRequired(I);
3980 }
3981}
3982
3983std::pair<InstructionCost, InstructionCost>
3985 ElementCount VF) const {
3986 assert(I->getOpcode() == Instruction::UDiv ||
3987 I->getOpcode() == Instruction::SDiv ||
3988 I->getOpcode() == Instruction::SRem ||
3989 I->getOpcode() == Instruction::URem);
3991
3993
3994 // Scalarization isn't legal for scalable vector types
3995 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3996 if (!VF.isScalable()) {
3997 // Get the scalarization cost and scale this amount by the probability of
3998 // executing the predicated block. If the instruction is not predicated,
3999 // we fall through to the next case.
4000 ScalarizationCost = 0;
4001
4002 // These instructions have a non-void type, so account for the phi nodes
4003 // that we will create. This cost is likely to be zero. The phi node
4004 // cost, if any, should be scaled by the block probability because it
4005 // models a copy at the end of each predicated block.
4006 ScalarizationCost += VF.getKnownMinValue() *
4007 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4008
4009 // The cost of the non-predicated instruction.
4010 ScalarizationCost += VF.getKnownMinValue() *
4011 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4012
4013 // The cost of insertelement and extractelement instructions needed for
4014 // scalarization.
4015 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4016
4017 // Scale the cost by the probability of executing the predicated blocks.
4018 // This assumes the predicated block for each vector lane is equally
4019 // likely.
4020 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4021 }
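 // For example (illustrative costs), with VF = 4 the cost accumulated above
 // is roughly 4 * (phi cost + scalar divide cost) plus the insert/extract
 // overhead, then divided by getReciprocalPredBlockProb() to scale it by the
 // assumed probability that the predicated block executes.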
4022 InstructionCost SafeDivisorCost = 0;
4023
4024 auto *VecTy = ToVectorTy(I->getType(), VF);
4025
4026 // The cost of the select guard to ensure all lanes are well defined
4027 // after we speculate above any internal control flow.
4028 SafeDivisorCost += TTI.getCmpSelInstrCost(
4029 Instruction::Select, VecTy,
4030 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4032
4033 // Certain instructions can be cheaper to vectorize if they have a constant
4034 // second vector operand. One example of this is shifts on x86.
4035 Value *Op2 = I->getOperand(1);
4036 auto Op2Info = TTI.getOperandInfo(Op2);
4037 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4038 Legal->isInvariant(Op2))
4040
4041 SmallVector<const Value *, 4> Operands(I->operand_values());
4042 SafeDivisorCost += TTI.getArithmeticInstrCost(
4043 I->getOpcode(), VecTy, CostKind,
4044 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4045 Op2Info, Operands, I);
4046 return {ScalarizationCost, SafeDivisorCost};
4047}
4048
4050 Instruction *I, ElementCount VF) {
4051 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4053 "Decision should not be set yet.");
4054 auto *Group = getInterleavedAccessGroup(I);
4055 assert(Group && "Must have a group.");
4056
4057 // If the instruction's allocated size doesn't equal its type size, it
4058 // requires padding and will be scalarized.
4059 auto &DL = I->getModule()->getDataLayout();
4060 auto *ScalarTy = getLoadStoreType(I);
4061 if (hasIrregularType(ScalarTy, DL))
4062 return false;
4063
4064 // If the group involves a non-integral pointer, we may not be able to
4065 // losslessly cast all values to a common type.
4066 unsigned InterleaveFactor = Group->getFactor();
4067 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4068 for (unsigned i = 0; i < InterleaveFactor; i++) {
4069 Instruction *Member = Group->getMember(i);
4070 if (!Member)
4071 continue;
4072 auto *MemberTy = getLoadStoreType(Member);
4073 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4074 // Don't coerce non-integral pointers to integers or vice versa.
4075 if (MemberNI != ScalarNI) {
4076 // TODO: Consider adding special nullptr value case here
4077 return false;
4078 } else if (MemberNI && ScalarNI &&
4079 ScalarTy->getPointerAddressSpace() !=
4080 MemberTy->getPointerAddressSpace()) {
4081 return false;
4082 }
4083 }
4084
4085 // Check if masking is required.
4086 // A Group may need masking for one of two reasons: it resides in a block that
4087 // needs predication, or it was decided to use masking to deal with gaps
4088 // (either a gap at the end of a load-access that may result in a speculative
4089 // load, or any gaps in a store-access).
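 // For example, a load group of factor 3 whose last member is missing would,
 // in its final vector iteration, read past the last accessed element unless
 // the gap is masked, and a store group with a missing member must be masked
 // so the gap locations are not overwritten.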
4090 bool PredicatedAccessRequiresMasking =
4091 blockNeedsPredicationForAnyReason(I->getParent()) &&
4093 bool LoadAccessWithGapsRequiresEpilogMasking =
4094 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4096 bool StoreAccessWithGapsRequiresMasking =
4097 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4098 if (!PredicatedAccessRequiresMasking &&
4099 !LoadAccessWithGapsRequiresEpilogMasking &&
4100 !StoreAccessWithGapsRequiresMasking)
4101 return true;
4102
4103 // If masked interleaving is required, we expect that the user/target had
4104 // enabled it, because otherwise it either wouldn't have been created or
4105 // it should have been invalidated by the CostModel.
4107 "Masked interleave-groups for predicated accesses are not enabled.");
4108
4109 if (Group->isReverse())
4110 return false;
4111
4112 auto *Ty = getLoadStoreType(I);
4113 const Align Alignment = getLoadStoreAlignment(I);
4114 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4115 : TTI.isLegalMaskedStore(Ty, Alignment);
4116}
4117
4119 Instruction *I, ElementCount VF) {
4120 // Get and ensure we have a valid memory instruction.
4121 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4122
4124 auto *ScalarTy = getLoadStoreType(I);
4125
4126 // In order to be widened, the pointer should be consecutive, first of all.
4127 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4128 return false;
4129
4130 // If the instruction is a store located in a predicated block, it will be
4131 // scalarized.
4132 if (isScalarWithPredication(I, VF))
4133 return false;
4134
4135 // If the instruction's allocated size doesn't equal its type size, it
4136 // requires padding and will be scalarized.
4137 auto &DL = I->getModule()->getDataLayout();
4138 if (hasIrregularType(ScalarTy, DL))
4139 return false;
4140
4141 return true;
4142}
4143
4144void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4145 // We should not collect Uniforms more than once per VF. Right now,
4146 // this function is called from collectUniformsAndScalars(), which
4147 // already does this check. Collecting Uniforms for VF=1 does not make any
4148 // sense.
4149
4150 assert(VF.isVector() && !Uniforms.contains(VF) &&
4151 "This function should not be visited twice for the same VF");
4152
4153 // Visit the list of Uniforms. If we do not find any uniform value, we
4154 // will not analyze again; Uniforms.count(VF) will still return 1.
4155 Uniforms[VF].clear();
4156
4157 // We now know that the loop is vectorizable!
4158 // Collect instructions inside the loop that will remain uniform after
4159 // vectorization.
4160
4161 // Global values, params and instructions outside of current loop are out of
4162 // scope.
4163 auto isOutOfScope = [&](Value *V) -> bool {
4164 Instruction *I = dyn_cast<Instruction>(V);
4165 return (!I || !TheLoop->contains(I));
4166 };
4167
4168 // Worklist containing uniform instructions demanding lane 0.
4169 SetVector<Instruction *> Worklist;
4170 BasicBlock *Latch = TheLoop->getLoopLatch();
4171
4172 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4173 // that are scalar with predication must not be considered uniform after
4174 // vectorization, because that would create an erroneous replicating region
4175 // where only a single instance out of VF should be formed.
4176 // TODO: optimize such seldom cases if found important, see PR40816.
4177 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4178 if (isOutOfScope(I)) {
4179 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4180 << *I << "\n");
4181 return;
4182 }
4183 if (isScalarWithPredication(I, VF)) {
4184 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4185 << *I << "\n");
4186 return;
4187 }
4188 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4189 Worklist.insert(I);
4190 };
4191
4192 // Start with the conditional branch. If the branch condition is an
4193 // instruction contained in the loop that is only used by the branch, it is
4194 // uniform.
4195 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4196 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4197 addToWorklistIfAllowed(Cmp);
4198
4199 auto PrevVF = VF.divideCoefficientBy(2);
4200 // Return true if all lanes perform the same memory operation, and we can
4201 // thus choose to execute only one.
4202 auto isUniformMemOpUse = [&](Instruction *I) {
4203 // If the value was already known to not be uniform for the previous
4204 // (smaller VF), it cannot be uniform for the larger VF.
4205 if (PrevVF.isVector()) {
4206 auto Iter = Uniforms.find(PrevVF);
4207 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4208 return false;
4209 }
4210 if (!Legal->isUniformMemOp(*I, VF))
4211 return false;
4212 if (isa<LoadInst>(I))
4213 // Loading the same address always produces the same result - at least
4214 // assuming aliasing and ordering which have already been checked.
4215 return true;
4216 // Storing the same value on every iteration.
4217 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4218 };
4219
4220 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4221 InstWidening WideningDecision = getWideningDecision(I, VF);
4222 assert(WideningDecision != CM_Unknown &&
4223 "Widening decision should be ready at this moment");
4224
4225 if (isUniformMemOpUse(I))
4226 return true;
4227
4228 return (WideningDecision == CM_Widen ||
4229 WideningDecision == CM_Widen_Reverse ||
4230 WideningDecision == CM_Interleave);
4231 };
4232
4233 // Returns true if Ptr is the pointer operand of a memory access instruction
4234 // I, I is known to not require scalarization, and the pointer is not also
4235 // stored.
4236 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4237 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4238 return false;
4239 return getLoadStorePointerOperand(I) == Ptr &&
4240 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4241 };
4242
4243 // Holds a list of values which are known to have at least one uniform use.
4244 // Note that there may be other uses which aren't uniform. A "uniform use"
4245 // here is something which only demands lane 0 of the unrolled iterations;
4246 // it does not imply that all lanes produce the same value (e.g. this is not
4247 // the usual meaning of uniform).
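 // For example, a consecutive widened load only needs the lane-0 address to
 // form the wide load, so its address computation has a uniform use here even
 // though the per-lane addresses differ.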
4248 SetVector<Value *> HasUniformUse;
4249
4250 // Scan the loop for instructions which are either a) known to have only
4251 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4252 for (auto *BB : TheLoop->blocks())
4253 for (auto &I : *BB) {
4254 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4255 switch (II->getIntrinsicID()) {
4256 case Intrinsic::sideeffect:
4257 case Intrinsic::experimental_noalias_scope_decl:
4258 case Intrinsic::assume:
4259 case Intrinsic::lifetime_start:
4260 case Intrinsic::lifetime_end:
4262 addToWorklistIfAllowed(&I);
4263 break;
4264 default:
4265 break;
4266 }
4267 }
4268
4269 // ExtractValue instructions must be uniform, because the operands are
4270 // known to be loop-invariant.
4271 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4272 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4273 "Expected aggregate value to be loop invariant");
4274 addToWorklistIfAllowed(EVI);
4275 continue;
4276 }
4277
4278 // If there's no pointer operand, there's nothing to do.
4280 if (!Ptr)
4281 continue;
4282
4283 if (isUniformMemOpUse(&I))
4284 addToWorklistIfAllowed(&I);
4285
4286 if (isVectorizedMemAccessUse(&I, Ptr))
4287 HasUniformUse.insert(Ptr);
4288 }
4289
4290 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4291 // demanding) users. Since loops are assumed to be in LCSSA form, this
4292 // disallows uses outside the loop as well.
4293 for (auto *V : HasUniformUse) {
4294 if (isOutOfScope(V))
4295 continue;
4296 auto *I = cast<Instruction>(V);
4297 auto UsersAreMemAccesses =
4298 llvm::all_of(I->users(), [&](User *U) -> bool {
4299 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4300 });
4301 if (UsersAreMemAccesses)
4302 addToWorklistIfAllowed(I);
4303 }
4304
4305 // Expand Worklist in topological order: whenever a new instruction
4306 // is added, its users should already be inside Worklist. This ensures
4307 // a uniform instruction will only be used by uniform instructions.
4308 unsigned idx = 0;
4309 while (idx != Worklist.size()) {
4310 Instruction *I = Worklist[idx++];
4311
4312 for (auto *OV : I->operand_values()) {
4313 // isOutOfScope operands cannot be uniform instructions.
4314 if (isOutOfScope(OV))
4315 continue;
4316 // First order recurrence Phi's should typically be considered
4317 // non-uniform.
4318 auto *OP = dyn_cast<PHINode>(OV);
4320 continue;
4321 // If all the users of the operand are uniform, then add the
4322 // operand into the uniform worklist.
4323 auto *OI = cast<Instruction>(OV);
4324 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4325 auto *J = cast<Instruction>(U);
4326 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4327 }))
4328 addToWorklistIfAllowed(OI);
4329 }
4330 }
4331
4332 // For an instruction to be added into Worklist above, all its users inside
4333 // the loop should also be in Worklist. However, this condition cannot be
4334 // true for phi nodes that form a cyclic dependence. We must process phi
4335 // nodes separately. An induction variable will remain uniform if all users
4336 // of the induction variable and induction variable update remain uniform.
4337 // The code below handles both pointer and non-pointer induction variables.
4338 for (const auto &Induction : Legal->getInductionVars()) {
4339 auto *Ind = Induction.first;
4340 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4341
4342 // Determine if all users of the induction variable are uniform after
4343 // vectorization.
4344 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4345 auto *I = cast<Instruction>(U);
4346 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4347 isVectorizedMemAccessUse(I, Ind);
4348 });
4349 if (!UniformInd)
4350 continue;
4351
4352 // Determine if all users of the induction variable update instruction are
4353 // uniform after vectorization.
4354 auto UniformIndUpdate =
4355 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4356 auto *I = cast<Instruction>(U);
4357 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4358 isVectorizedMemAccessUse(I, IndUpdate);
4359 });
4360 if (!UniformIndUpdate)
4361 continue;
4362
4363 // The induction variable and its update instruction will remain uniform.
4364 addToWorklistIfAllowed(Ind);
4365 addToWorklistIfAllowed(IndUpdate);
4366 }
4367
4368 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4369}
4370
4372 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4373
4375 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4376 "runtime pointer checks needed. Enable vectorization of this "
4377 "loop with '#pragma clang loop vectorize(enable)' when "
4378 "compiling with -Os/-Oz",
4379 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4380 return true;
4381 }
4382
4383 if (!PSE.getPredicate().isAlwaysTrue()) {
4384 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4385 "runtime SCEV checks needed. Enable vectorization of this "
4386 "loop with '#pragma clang loop vectorize(enable)' when "
4387 "compiling with -Os/-Oz",
4388 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4389 return true;
4390 }
4391
4392 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4393 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4394 reportVectorizationFailure("Runtime stride check for small trip count",
4395 "runtime stride == 1 checks needed. Enable vectorization of "
4396 "this loop without such check by compiling with -Os/-Oz",
4397 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4398 return true;
4399 }
4400
4401 return false;
4402}
4403
4405LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4407 return ElementCount::getScalable(0);
4408
4410 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4411 "ScalableVectorizationDisabled", ORE, TheLoop);
4412 return ElementCount::getScalable(0);
4413 }
4414
4415 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4416
4417 auto MaxScalableVF = ElementCount::getScalable(
4418 std::numeric_limits<ElementCount::ScalarTy>::max());
4419
4420 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4421 // FIXME: While for scalable vectors this is currently sufficient, this should
4422 // be replaced by a more detailed mechanism that filters out specific VFs,
4423 // instead of invalidating vectorization for a whole set of VFs based on the
4424 // MaxVF.
4425
4426 // Disable scalable vectorization if the loop contains unsupported reductions.
4427 if (!canVectorizeReductions(MaxScalableVF)) {
4429 "Scalable vectorization not supported for the reduction "
4430 "operations found in this loop.",
4431 "ScalableVFUnfeasible", ORE, TheLoop);
4432 return ElementCount::getScalable(0);
4433 }
4434
4435 // Disable scalable vectorization if the loop contains any instructions
4436 // with element types not supported for scalable vectors.
4437 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4438 return !Ty->isVoidTy() &&
4440 })) {
4441 reportVectorizationInfo("Scalable vectorization is not supported "
4442 "for all element types found in this loop.",
4443 "ScalableVFUnfeasible", ORE, TheLoop);
4444 return ElementCount::getScalable(0);
4445 }
4446
4448 return MaxScalableVF;
4449
4450 // Limit MaxScalableVF by the maximum safe dependence distance.
4451 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4452 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4453 else
4454 MaxScalableVF = ElementCount::getScalable(0);
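 // For example (illustrative values), with MaxSafeElements = 32 and a target
 // maximum vscale of 16 this clamps MaxScalableVF to "vscale x 2", which
 // stays within 32 elements for any vscale up to 16.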
4455
4456 if (!MaxScalableVF)
4458 "Max legal vector width too small, scalable vectorization "
4459 "unfeasible.",
4460 "ScalableVFUnfeasible", ORE, TheLoop);
4461
4462 return MaxScalableVF;
4463}
4464
4465FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4466 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4468 unsigned SmallestType, WidestType;
4469 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4470
4471 // Get the maximum safe dependence distance in bits computed by LAA.
4472 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4473 // the memory access that is most restrictive (involved in the smallest
4474 // dependence distance).
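 // For example (illustrative values), if LAA reports a maximum safe vector
 // width of 256 bits and the widest type is i32, MaxSafeElements is
 // 256 / 32 = 8, so the maximum safe fixed VF below is 8.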
4475 unsigned MaxSafeElements =
4477
4478 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4479 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4480
4481 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4482 << ".\n");
4483 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4484 << ".\n");
4485
4486 // First analyze the UserVF, fall back if the UserVF should be ignored.
4487 if (UserVF) {
4488 auto MaxSafeUserVF =
4489 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4490
4491 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4492 // If `VF=vscale x N` is safe, then so is `VF=N`
4493 if (UserVF.isScalable())
4494 return FixedScalableVFPair(
4495 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4496 else
4497 return UserVF;
4498 }
4499
4500 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4501
4502 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4503 // is better to ignore the hint and let the compiler choose a suitable VF.
4504 if (!UserVF.isScalable()) {
4505 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4506 << " is unsafe, clamping to max safe VF="
4507 << MaxSafeFixedVF << ".\n");
4508 ORE->emit([&]() {
4509 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4511 TheLoop->getHeader())
4512 << "User-specified vectorization factor &q