1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
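// Conceptual illustration (a sketch, not code from this pass): with VF = 4 and
// UF = 1, the transformation described above turns
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// into a wide loop whose induction variable advances by the vector width,
// followed by a scalar epilogue for the remaining n % 4 iterations:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i:i+3] = b[i:i+3] + 1;     // one wide iteration covers 4 lanes
//   for (; i < n; ++i)
//     a[i] = b[i] + 1;             // scalar remainder loop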
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanVerifier.h"
65#include "llvm/ADT/APInt.h"
66#include "llvm/ADT/ArrayRef.h"
67#include "llvm/ADT/DenseMap.h"
69#include "llvm/ADT/Hashing.h"
70#include "llvm/ADT/MapVector.h"
71#include "llvm/ADT/STLExtras.h"
73#include "llvm/ADT/SmallSet.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
82#include "llvm/Analysis/CFG.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfo.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/MDBuilder.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/Type.h"
124#include "llvm/IR/Use.h"
125#include "llvm/IR/User.h"
126#include "llvm/IR/Value.h"
127#include "llvm/IR/ValueHandle.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
133#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <map>
155#include <memory>
156#include <string>
157#include <tuple>
158#include <utility>
159
160using namespace llvm;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169/// @{
170/// Metadata attribute names
171const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
173 "llvm.loop.vectorize.followup_vectorized";
175 "llvm.loop.vectorize.followup_epilogue";
176/// @}
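// Usage sketch (assumed, based on llvm::makeFollowupLoopID from
// llvm/Transforms/Utils/LoopUtils.h): the "followup_all" attribute applies to
// every loop produced by the transformation, while the vectorized/epilogue
// attributes apply only to the corresponding output loop, e.g.
//
//   std::optional<MDNode *> VectorizedLoopID =
//       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
//                                       LLVMLoopVectorizeFollowupVectorized});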
177
178STATISTIC(LoopsVectorized, "Number of loops vectorized");
179STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
180STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
181
183 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
184 cl::desc("Enable vectorization of epilogue loops."));
185
187 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
188 cl::desc("When epilogue vectorization is enabled, and a value greater than "
189 "1 is specified, forces the given VF for all applicable epilogue "
190 "loops."));
191
193 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
194 cl::desc("Only loops with vectorization factor equal to or larger than "
195 "the specified value are considered for epilogue vectorization."));
196
197/// Loops with a known constant trip count below this number are vectorized only
198/// if no scalar iteration overheads are incurred.
200 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
201 cl::desc("Loops with a constant trip count that is smaller than this "
202 "value are vectorized only if no scalar iteration overheads "
203 "are incurred."));
204
206 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
207 cl::desc("The maximum allowed number of runtime memory checks"));
208
210 "vectorize-use-legacy-cost-model", cl::init(false), cl::Hidden,
211 cl::desc("Use the legacy cost model instead of the VPlan-based cost model. "
212 "This option will be removed in the future."));
213
214// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
215// that predication is preferred, and this lists all options. I.e., the
216// vectorizer will try to fold the tail-loop (epilogue) into the vector body
217// and predicate the instructions accordingly. If tail-folding fails, there are
218// different fallback strategies depending on these values:
220 enum Option {
224 };
225} // namespace PreferPredicateTy
226
228 "prefer-predicate-over-epilogue",
231 cl::desc("Tail-folding and predication preferences over creating a scalar "
232 "epilogue loop."),
234 "scalar-epilogue",
235 "Don't tail-predicate loops, create scalar epilogue"),
237 "predicate-else-scalar-epilogue",
238 "prefer tail-folding, create scalar epilogue if tail "
239 "folding fails."),
241 "predicate-dont-vectorize",
242 "prefers tail-folding, don't attempt vectorization if "
243 "tail-folding fails.")));
244
246 "force-tail-folding-style", cl::desc("Force the tail folding style"),
247 cl::init(TailFoldingStyle::None),
249 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
251 TailFoldingStyle::Data, "data",
252 "Create lane mask for data only, using active.lane.mask intrinsic"),
253 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
254 "data-without-lane-mask",
255 "Create lane mask with compare/stepvector"),
256 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
257 "Create lane mask using active.lane.mask intrinsic, and use "
258 "it for both data and control flow"),
259 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
260 "data-and-control-without-rt-check",
261 "Similar to data-and-control, but remove the runtime check"),
262 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
263 "Use predicated EVL instructions for tail folding. If EVL "
264 "is unsupported, fallback to data-without-lane-mask.")));
265
267 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
268 cl::desc("Maximize bandwidth when selecting vectorization factor which "
269 "will be determined by the smallest type in loop."));
270
272 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
273 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
274
275/// An interleave-group may need masking if it resides in a block that needs
276/// predication, or in order to mask away gaps.
278 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
279 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
280
282 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
283 cl::desc("A flag that overrides the target's number of scalar registers."));
284
286 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's number of vector registers."));
288
290 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
291 cl::desc("A flag that overrides the target's max interleave factor for "
292 "scalar loops."));
293
295 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
296 cl::desc("A flag that overrides the target's max interleave factor for "
297 "vectorized loops."));
298
300 "force-target-instruction-cost", cl::init(0), cl::Hidden,
301 cl::desc("A flag that overrides the target's expected cost for "
302 "an instruction to a single constant value. Mostly "
303 "useful for getting consistent testing."));
304
306 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
307 cl::desc(
308 "Pretend that scalable vectors are supported, even if the target does "
309 "not support them. This flag should only be used for testing."));
310
312 "small-loop-cost", cl::init(20), cl::Hidden,
313 cl::desc(
314 "The cost of a loop that is considered 'small' by the interleaver."));
315
317 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
318 cl::desc("Enable the use of the block frequency analysis to access PGO "
319 "heuristics minimizing code growth in cold regions and being more "
320 "aggressive in hot regions."));
321
322// Runtime interleave loops for load/store throughput.
324 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
325 cl::desc(
326 "Enable runtime interleaving until load/store ports are saturated"));
327
328/// The number of stores in a loop that are allowed to need predication.
330 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
331 cl::desc("Max number of stores to be predicated behind an if."));
332
334 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
335 cl::desc("Count the induction variable only once when interleaving"));
336
338 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
339 cl::desc("Enable if predication of stores during vectorization."));
340
342 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
343 cl::desc("The maximum interleave count to use when interleaving a scalar "
344 "reduction in a nested loop."));
345
346static cl::opt<bool>
347 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
349 cl::desc("Prefer in-loop vector reductions, "
350 "overriding the targets preference."));
351
353 "force-ordered-reductions", cl::init(false), cl::Hidden,
354 cl::desc("Enable the vectorisation of loops with in-order (strict) "
355 "FP reductions"));
356
358 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
359 cl::desc(
360 "Prefer predicating a reduction operation over an after loop select."));
361
362namespace llvm {
364 "enable-vplan-native-path", cl::Hidden,
365 cl::desc("Enable VPlan-native vectorization path with "
366 "support for outer loop vectorization."));
367}
368
369// This flag enables the stress testing of the VPlan H-CFG construction in the
370// VPlan-native vectorization path. It must be used in conjunction with
371// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
372// verification of the H-CFGs built.
374 "vplan-build-stress-test", cl::init(false), cl::Hidden,
375 cl::desc(
376 "Build VPlan for every supported loop nest in the function and bail "
377 "out right after the build (stress test the VPlan H-CFG construction "
378 "in the VPlan-native vectorization path)."));
379
381 "interleave-loops", cl::init(true), cl::Hidden,
382 cl::desc("Enable loop interleaving in Loop vectorization passes"));
384 "vectorize-loops", cl::init(true), cl::Hidden,
385 cl::desc("Run the Loop vectorization passes"));
386
388 "vplan-print-in-dot-format", cl::Hidden,
389 cl::desc("Use dot format instead of plain text when dumping VPlans"));
390
392 "force-widen-divrem-via-safe-divisor", cl::Hidden,
393 cl::desc(
394 "Override cost based safe divisor widening for div/rem instructions"));
395
397 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
399 cl::desc("Try wider VFs if they enable the use of vector variants"));
400
401// Likelihood of bypassing the vectorized loop because assumptions about SCEV
402// variables not overflowing do not hold. See `emitSCEVChecks`.
403static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
404// Likelihood of bypassing the vectorized loop because pointers overlap. See
405// `emitMemRuntimeChecks`.
406static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
407// Likelihood of bypassing the vectorized loop because there are zero trips left
408// after prolog. See `emitIterationCountCheck`.
409static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
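// Sketch of the assumed use of these weights (hypothetical helper, not part of
// this file): the pairs are emitted as !prof branch weights, so a 1:127 ratio
// tells later passes that the bypass edge is expected roughly once per 128
// executions of the check.
static void attachBypassWeightsSketch(BranchInst *BypassBr) {
  MDBuilder MDB(BypassBr->getContext());
  BypassBr->setMetadata(LLVMContext::MD_prof,
                        MDB.createBranchWeights(MinItersBypassWeights[0],
                                                MinItersBypassWeights[1]));
}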
410
411/// A helper function that returns true if the given type is irregular. The
412/// type is irregular if its allocated size doesn't equal the store size of an
413/// element of the corresponding vector type.
414static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
415 // Determine if an array of N elements of type Ty is "bitcast compatible"
416 // with a <N x Ty> vector.
417 // This is only true if there is no padding between the array elements.
418 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
419}
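// Illustrative example (hypothetical helper, not part of this file): i1 has a
// type size of 1 bit but an allocation size of 8 bits, so arrays of i1 are not
// bitcast-compatible with <N x i1>; i32 has matching sizes and is regular.
static bool irregularTypeExample(LLVMContext &Ctx, const DataLayout &DL) {
  return hasIrregularType(Type::getInt1Ty(Ctx), DL) &&  // 8 != 1 -> irregular
         !hasIrregularType(Type::getInt32Ty(Ctx), DL);  // 32 == 32 -> regular
}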
420
421/// Returns "best known" trip count for the specified loop \p L as defined by
422/// the following procedure:
423/// 1) Returns exact trip count if it is known.
424/// 2) Returns expected trip count according to profile data if any.
425/// 3) Returns upper bound estimate if it is known.
426/// 4) Returns std::nullopt if all of the above failed.
427static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
428 Loop *L) {
429 // Check if exact trip count is known.
430 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
431 return ExpectedTC;
432
433 // Check if there is an expected trip count available from profile data.
435 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
436 return *EstimatedTC;
437
438 // Check if upper bound estimate is known.
439 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
440 return ExpectedTC;
441
442 return std::nullopt;
443}
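// Usage sketch (hypothetical caller, not part of this file): combine the
// "best known" trip count with the -vectorizer-min-trip-count threshold to
// classify a loop as tiny; an unknown trip count is not treated as tiny.
static bool hasTinyTripCount(ScalarEvolution &SE, Loop *L, unsigned Threshold) {
  std::optional<unsigned> BestKnownTC = getSmallBestKnownTC(SE, L);
  return BestKnownTC && *BestKnownTC < Threshold;
}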
444
445namespace {
446// Forward declare GeneratedRTChecks.
447class GeneratedRTChecks;
448
449using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
450} // namespace
451
452namespace llvm {
453
455
456/// InnerLoopVectorizer vectorizes loops which contain only one basic
457/// block to a specified vectorization factor (VF).
458/// This class performs the widening of scalars into vectors, or multiple
459/// scalars. This class also implements the following features:
460/// * It inserts an epilogue loop for handling loops that don't have iteration
461/// counts that are known to be a multiple of the vectorization factor.
462/// * It handles the code generation for reduction variables.
463/// * Scalarization (implementation using scalars) of un-vectorizable
464/// instructions.
465/// InnerLoopVectorizer does not perform any vectorization-legality
466/// checks, and relies on the caller to check for the different legality
467/// aspects. The InnerLoopVectorizer relies on the
468/// LoopVectorizationLegality class to provide information about the induction
469/// and reduction variables that were found to a given vectorization factor.
471public:
474 const TargetLibraryInfo *TLI,
478 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
480 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
481 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
482 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
483 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
485 // Query this against the original loop and save it here because the profile
486 // of the original loop header may change as the transformation happens.
489
491 this->MinProfitableTripCount = VecWidth;
492 else
493 this->MinProfitableTripCount = MinProfitableTripCount;
494 }
495
496 virtual ~InnerLoopVectorizer() = default;
497
498 /// Create a new empty loop that will contain vectorized instructions later
499 /// on, while the old loop will be used as the scalar remainder. Control flow
500 /// is generated around the vectorized (and scalar epilogue) loops consisting
501 /// of various checks and bypasses. Return the pre-header block of the new
502 /// loop and the start value for the canonical induction, if it is != 0. The
503 /// latter is the case when vectorizing the epilogue loop. In the case of
504/// epilogue vectorization, this function is overridden to handle the more
505 /// complex control flow around the loops. \p ExpandedSCEVs is used to
506 /// look up SCEV expansions for expressions needed during skeleton creation.
507 virtual std::pair<BasicBlock *, Value *>
508 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
509
510 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
511 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
512
513 // Return true if any runtime check is added.
515
516 /// A helper function to scalarize a single Instruction in the innermost loop.
517 /// Generates a sequence of scalar instances for each lane between \p MinLane
518 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
519 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
520 /// Instr's operands.
521 void scalarizeInstruction(const Instruction *Instr,
522 VPReplicateRecipe *RepRecipe,
523 const VPIteration &Instance,
524 VPTransformState &State);
525
526 /// Fix the non-induction PHIs in \p Plan.
527 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
528
529 /// Create a new phi node for the induction variable \p OrigPhi to resume
530 /// iteration count in the scalar epilogue, from where the vectorized loop
531 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
532 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
533 /// and the resume values can come from an additional bypass block, the \p
534 /// AdditionalBypass pair provides information about the bypass block and the
535 /// end value on the edge from bypass to this loop.
537 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
538 ArrayRef<BasicBlock *> BypassBlocks,
539 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
540
541 /// Returns the original loop trip count.
542 Value *getTripCount() const { return TripCount; }
543
544 /// Used to set the trip count after ILV's construction and after the
545 /// preheader block has been executed. Note that this always holds the trip
546 /// count of the original loop for both main loop and epilogue vectorization.
547 void setTripCount(Value *TC) { TripCount = TC; }
548
549protected:
551
552 /// A small list of PHINodes.
554
555 /// A type for scalarized values in the new loop. Each value from the
556 /// original loop, when scalarized, is represented by UF x VF scalar values
557 /// in the new unrolled loop, where UF is the unroll factor and VF is the
558 /// vectorization factor.
560
561 /// Set up the values of the IVs correctly when exiting the vector loop.
562 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
563 Value *VectorTripCount, Value *EndValue,
564 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
565 VPlan &Plan, VPTransformState &State);
566
567 /// Iteratively sink the scalarized operands of a predicated instruction into
568 /// the block that was created for it.
569 void sinkScalarOperands(Instruction *PredInst);
570
571 /// Returns (and creates if needed) the trip count of the widened loop.
573
574 /// Emit a bypass check to see if the vector trip count is zero, including if
575 /// it overflows.
577
578 /// Emit a bypass check to see if all of the SCEV assumptions we've
579 /// had to make are correct. Returns the block containing the checks or
580 /// nullptr if no checks have been added.
582
583 /// Emit bypass checks to check any memory assumptions we may have made.
584 /// Returns the block containing the checks or nullptr if no checks have been
585 /// added.
587
588 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
589 /// vector loop preheader, middle block and scalar preheader.
591
592 /// Create new phi nodes for the induction variables to resume iteration count
593 /// in the scalar epilogue, from where the vectorized loop left off.
594 /// In cases where the loop skeleton is more complicated (e.g., epilogue
595 /// vectorization) and the resume values can come from an additional bypass
596 /// block, the \p AdditionalBypass pair provides information about the bypass
597 /// block and the end value on the edge from bypass to this loop.
599 const SCEV2ValueTy &ExpandedSCEVs,
600 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
601
602 /// Complete the loop skeleton by adding debug MDs, creating appropriate
603 /// conditional branches in the middle block, preparing the builder and
604 /// running the verifier. Return the preheader of the completed vector loop.
606
607 /// Allow subclasses to override and print debug traces before/after vplan
608 /// execution, when trace information is requested.
609 virtual void printDebugTracesAtStart() {}
610 virtual void printDebugTracesAtEnd() {}
611
612 /// The original loop.
614
615 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
616 /// dynamic knowledge to simplify SCEV expressions and converts them to a
617 /// more usable form.
619
620 /// Loop Info.
622
623 /// Dominator Tree.
625
626 /// Target Library Info.
628
629 /// Target Transform Info.
631
632 /// Assumption Cache.
634
635 /// Interface to emit optimization remarks.
637
638 /// The vectorization SIMD factor to use. Each vector will have this many
639 /// vector elements.
641
643
644 /// The vectorization unroll factor to use. Each scalar is vectorized to this
645 /// many different vector instructions.
646 unsigned UF;
647
648 /// The builder that we use
650
651 // --- Vectorization state ---
652
653 /// The vector-loop preheader.
655
656 /// The scalar-loop preheader.
658
659 /// Middle Block between the vector and the scalar.
661
662 /// The unique ExitBlock of the scalar loop if one exists. Note that
663 /// there can be multiple exiting edges reaching this block.
665
666 /// The scalar loop body.
668
669 /// A list of all bypass blocks. The first block is the entry of the loop.
671
672 /// Store instructions that were predicated.
674
675 /// Trip count of the original loop.
676 Value *TripCount = nullptr;
677
678 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
680
681 /// The legality analysis.
683
684 /// The profitability analysis.
686
687 // Record whether runtime checks are added.
688 bool AddedSafetyChecks = false;
689
690 // Holds the end values for each induction variable. We save the end values
691 // so we can later fix-up the external users of the induction variables.
693
694 /// BFI and PSI are used to check for profile guided size optimizations.
697
698 // Whether this loop should be optimized for size based on profile guided size
699 // optimizations.
701
702 /// Structure to hold information about generated runtime checks, responsible
703 /// for cleaning the checks, if vectorization turns out unprofitable.
704 GeneratedRTChecks &RTChecks;
705
706 // Holds the resume values for reductions in the loops, used to set the
707 // correct start value of reduction PHIs when vectorizing the epilogue.
710};
711
713public:
716 const TargetLibraryInfo *TLI,
718 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
721 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
723 ElementCount::getFixed(1),
724 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
725 BFI, PSI, Check) {}
726};
727
728/// Encapsulate information regarding vectorization of a loop and its epilogue.
729/// This information is meant to be updated and used across two stages of
730/// epilogue vectorization.
733 unsigned MainLoopUF = 0;
735 unsigned EpilogueUF = 0;
740 Value *TripCount = nullptr;
742
744 ElementCount EVF, unsigned EUF)
745 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
746 assert(EUF == 1 &&
747 "A high UF for the epilogue loop is likely not beneficial.");
748 }
749};
750
751/// An extension of the inner loop vectorizer that creates a skeleton for a
752/// vectorized loop that has its epilogue (residual) also vectorized.
753/// The idea is to run the vplan on a given loop twice, firstly to setup the
754/// skeleton and vectorize the main loop, and secondly to complete the skeleton
755/// from the first step and vectorize the epilogue. This is achieved by
756/// deriving two concrete strategy classes from this base class and invoking
757/// them in succession from the loop vectorizer planner.
759public:
767 GeneratedRTChecks &Checks)
769 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
770 CM, BFI, PSI, Checks),
771 EPI(EPI) {}
772
773 // Override this function to handle the more complex control flow around the
774 // three loops.
775 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
776 const SCEV2ValueTy &ExpandedSCEVs) final {
777 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
778 }
779
780 /// The interface for creating a vectorized skeleton using one of two
781 /// different strategies, each corresponding to one execution of the vplan
782 /// as described above.
783 virtual std::pair<BasicBlock *, Value *>
784 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
785
786 /// Holds and updates state information required to vectorize the main loop
787 /// and its epilogue in two separate passes. This setup helps us avoid
788 /// regenerating and recomputing runtime safety checks. It also helps us to
789 /// shorten the iteration-count-check path length for the cases where the
790 /// iteration count of the loop is so small that the main vector loop is
791 /// completely skipped.
793};
794
795/// A specialized derived class of inner loop vectorizer that performs
796/// vectorization of *main* loops in the process of vectorizing loops and their
797/// epilogues.
799public:
807 GeneratedRTChecks &Check)
809 EPI, LVL, CM, BFI, PSI, Check) {}
810 /// Implements the interface for creating a vectorized skeleton using the
811 /// *main loop* strategy (i.e., the first pass of vplan execution).
812 std::pair<BasicBlock *, Value *>
813 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
814
815protected:
816 /// Emits an iteration count bypass check once for the main loop (when \p
817 /// ForEpilogue is false) and once for the epilogue loop (when \p
818 /// ForEpilogue is true).
819 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
820 void printDebugTracesAtStart() override;
821 void printDebugTracesAtEnd() override;
822};
823
824// A specialized derived class of inner loop vectorizer that performs
825// vectorization of *epilogue* loops in the process of vectorizing loops and
826// their epilogues.
828public:
836 GeneratedRTChecks &Checks)
838 EPI, LVL, CM, BFI, PSI, Checks) {
840 }
841 /// Implements the interface for creating a vectorized skeleton using the
842 /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
843 std::pair<BasicBlock *, Value *>
844 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
845
846protected:
847 /// Emits an iteration count bypass check after the main vector loop has
848 /// finished to see if there are any iterations left to execute by either
849 /// the vector epilogue or the scalar epilogue.
851 BasicBlock *Bypass,
852 BasicBlock *Insert);
853 void printDebugTracesAtStart() override;
854 void printDebugTracesAtEnd() override;
855};
856} // end namespace llvm
857
858/// Look for a meaningful debug location on the instruction or its
859/// operands.
861 if (!I)
862 return DebugLoc();
863
865 if (I->getDebugLoc() != Empty)
866 return I->getDebugLoc();
867
868 for (Use &Op : I->operands()) {
869 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
870 if (OpInst->getDebugLoc() != Empty)
871 return OpInst->getDebugLoc();
872 }
873
874 return I->getDebugLoc();
875}
876
877/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
878/// is passed, the message relates to that particular instruction.
879#ifndef NDEBUG
880static void debugVectorizationMessage(const StringRef Prefix,
881 const StringRef DebugMsg,
882 Instruction *I) {
883 dbgs() << "LV: " << Prefix << DebugMsg;
884 if (I != nullptr)
885 dbgs() << " " << *I;
886 else
887 dbgs() << '.';
888 dbgs() << '\n';
889}
890#endif
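#ifndef NDEBUG
// Usage sketch (hypothetical call sites, not part of this file). Call sites
// wrap these in LLVM_DEBUG(...), so the output only appears under
// -debug-only=loop-vectorize; the first call appends the instruction, the
// second ends the message with a period because no instruction is given.
static void debugMessageExamples(Instruction *I) {
  // Prints: "LV: Not vectorizing: call cannot be widened <instruction>"
  debugVectorizationMessage("Not vectorizing: ", "call cannot be widened", I);
  // Prints: "LV: Interleaving is disabled."
  debugVectorizationMessage("", "Interleaving is disabled", nullptr);
}
#endif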
891
892/// Create an analysis remark that explains why vectorization failed
893///
894/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
895/// RemarkName is the identifier for the remark. If \p I is passed it is an
896/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
897/// the location of the remark. \return the remark object that can be
898/// streamed to.
900 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
901 Value *CodeRegion = TheLoop->getHeader();
902 DebugLoc DL = TheLoop->getStartLoc();
903
904 if (I) {
905 CodeRegion = I->getParent();
906 // If there is no debug location attached to the instruction, fall back to
907 // using the loop's.
908 if (I->getDebugLoc())
909 DL = I->getDebugLoc();
910 }
911
912 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
913}
914
915namespace llvm {
916
917/// Return a value for Step multiplied by VF.
918Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
919 int64_t Step) {
920 assert(Ty->isIntegerTy() && "Expected an integer step");
921 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
922}
923
924/// Return the runtime value for VF.
925Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
926 return B.CreateElementCount(Ty, VF);
927}
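// Usage sketch (hypothetical helper, not part of this file): the canonical IV
// of the vector loop advances by VF * UF each iteration. For a fixed VF of 4
// and UF = 2 this folds to the constant 8; for a scalable VF it becomes a
// multiple of vscale.
static Value *createIVStepSketch(IRBuilderBase &B, ElementCount VF,
                                 unsigned UF) {
  return createStepForVF(B, B.getInt64Ty(), VF, UF);
}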
928
930 Loop *OrigLoop) {
931 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
932 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
933
934 ScalarEvolution &SE = *PSE.getSE();
935 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
936}
937
939 const StringRef OREMsg, const StringRef ORETag,
940 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
941 Instruction *I) {
942 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
943 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
944 ORE->emit(
945 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
946 << "loop not vectorized: " << OREMsg);
947}
948
949/// Reports an informative message: print \p Msg for debugging purposes as well
950/// as an optimization remark. Uses either \p I as location of the remark, or
951/// otherwise \p TheLoop.
952static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
953 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
954 Instruction *I = nullptr) {
956 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
957 ORE->emit(
958 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
959 << Msg);
960}
961
962/// Report successful vectorization of the loop. In case an outer loop is
963/// vectorized, prepend "outer" to the vectorization remark.
965 VectorizationFactor VF, unsigned IC) {
967 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
968 nullptr));
969 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
970 ORE->emit([&]() {
971 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
972 TheLoop->getHeader())
973 << "vectorized " << LoopType << "loop (vectorization width: "
974 << ore::NV("VectorizationFactor", VF.Width)
975 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
976 });
977}
978
979} // end namespace llvm
980
981namespace llvm {
982
983// Loop vectorization cost-model hints how the scalar epilogue loop should be
984// lowered.
986
987 // The default: allowing scalar epilogues.
989
990 // Vectorization with OptForSize: don't allow epilogues.
992
993 // A special case of vectorisation with OptForSize: loops with a very small
994 // trip count are considered for vectorization under OptForSize, thereby
995 // making sure the cost of their loop body is dominant, free of runtime
996 // guards and scalar iteration overheads.
998
999 // Loop hint predicate indicating an epilogue is undesired.
1001
1002 // Directive indicating we must either tail fold or not vectorize
1005
1006using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1007
1008/// LoopVectorizationCostModel - estimates the expected speedups due to
1009/// vectorization.
1010/// In many cases vectorization is not profitable. This can happen for
1011/// a number of reasons. In this class we mainly attempt to predict the
1012/// expected speedup/slowdowns due to the supported instruction set. We use the
1013/// TargetTransformInfo to query the different backends for the cost of
1014/// different operations.
1016public:
1020 const TargetTransformInfo &TTI,
1026 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1027 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1028 Hints(Hints), InterleaveInfo(IAI) {}
1029
1030 /// \return An upper bound for the vectorization factors (both fixed and
1031 /// scalable). If the factors are 0, vectorization and interleaving should be
1032 /// avoided up front.
1033 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1034
1035 /// \return True if runtime checks are required for vectorization, and false
1036 /// otherwise.
1037 bool runtimeChecksRequired();
1038
1039 /// Setup cost-based decisions for user vectorization factor.
1040 /// \return true if the UserVF is a feasible VF to be chosen.
1044 return expectedCost(UserVF).isValid();
1045 }
1046
1047 /// \return The size (in bits) of the smallest and widest types in the code
1048 /// that needs to be vectorized. We ignore values that remain scalar such as
1049 /// 64 bit loop indices.
1050 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1051
1052 /// \return The desired interleave count.
1053 /// If interleave count has been specified by metadata it will be returned.
1054 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1055 /// are the selected vectorization factor and the cost of the selected VF.
1056 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1057
1058 /// Memory access instruction may be vectorized in more than one way.
1059 /// Form of instruction after vectorization depends on cost.
1060 /// This function takes cost-based decisions for Load/Store instructions
1061 /// and collects them in a map. This decisions map is used for building
1062 /// the lists of loop-uniform and loop-scalar instructions.
1063 /// The calculated cost is saved with widening decision in order to
1064 /// avoid redundant calculations.
1066
1067 /// A call may be vectorized in different ways depending on whether we have
1068 /// vectorized variants available and whether the target supports masking.
1069 /// This function analyzes all calls in the function at the supplied VF,
1070 /// makes a decision based on the costs of available options, and stores that
1071 /// decision in a map for use in planning and plan execution.
1073
1074 /// A struct that represents some properties of the register usage
1075 /// of a loop.
1077 /// Holds the number of loop invariant values that are used in the loop.
1078 /// The key is ClassID of target-provided register class.
1080 /// Holds the maximum number of concurrent live intervals in the loop.
1081 /// The key is ClassID of target-provided register class.
1083 };
1084
1085 /// \return Returns information about the register usages of the loop for the
1086 /// given vectorization factors.
1089
1090 /// Collect values we want to ignore in the cost model.
1091 void collectValuesToIgnore();
1092
1093 /// Collect all element types in the loop for which widening is needed.
1095
1096 /// Split reductions into those that happen in the loop, and those that happen
1097 /// outside. In-loop reductions are collected into InLoopReductions.
1099
1100 /// Returns true if we should use strict in-order reductions for the given
1101 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1102 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1103 /// of FP operations.
1104 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1105 return !Hints->allowReordering() && RdxDesc.isOrdered();
1106 }
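 // Example (illustration only):
 //   float Acc = 0.0f;
 //   for (int I = 0; I < N; ++I)
 //     Acc += X[I];               // strict FP order, no reassociation
 // Without fast-math reassociation RdxDesc.isOrdered() is true, so unless the
 // loop hints allow reordering this reduction is vectorized in-order (strict)
 // rather than as a tree-wise reduction.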
1107
1108 /// \returns The smallest bitwidth each instruction can be represented with.
1109 /// The vector equivalents of these instructions should be truncated to this
1110 /// type.
1112 return MinBWs;
1113 }
1114
1115 /// \returns True if it is more profitable to scalarize instruction \p I for
1116 /// vectorization factor \p VF.
1118 assert(VF.isVector() &&
1119 "Profitable to scalarize relevant only for VF > 1.");
1120 assert(
1121 TheLoop->isInnermost() &&
1122 "cost-model should not be used for outer loops (in VPlan-native path)");
1123
1124 auto Scalars = InstsToScalarize.find(VF);
1125 assert(Scalars != InstsToScalarize.end() &&
1126 "VF not yet analyzed for scalarization profitability");
1127 return Scalars->second.contains(I);
1128 }
1129
1130 /// Returns true if \p I is known to be uniform after vectorization.
1132 assert(
1133 TheLoop->isInnermost() &&
1134 "cost-model should not be used for outer loops (in VPlan-native path)");
1135 // Pseudo probe needs to be duplicated for each unrolled iteration and
1136 // vector lane so that profiled loop trip count can be accurately
1137 // accumulated instead of being under counted.
1138 if (isa<PseudoProbeInst>(I))
1139 return false;
1140
1141 if (VF.isScalar())
1142 return true;
1143
1144 auto UniformsPerVF = Uniforms.find(VF);
1145 assert(UniformsPerVF != Uniforms.end() &&
1146 "VF not yet analyzed for uniformity");
1147 return UniformsPerVF->second.count(I);
1148 }
1149
1150 /// Returns true if \p I is known to be scalar after vectorization.
1152 assert(
1153 TheLoop->isInnermost() &&
1154 "cost-model should not be used for outer loops (in VPlan-native path)");
1155 if (VF.isScalar())
1156 return true;
1157
1158 auto ScalarsPerVF = Scalars.find(VF);
1159 assert(ScalarsPerVF != Scalars.end() &&
1160 "Scalar values are not calculated for VF");
1161 return ScalarsPerVF->second.count(I);
1162 }
1163
1164 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1165 /// for vectorization factor \p VF.
1167 return VF.isVector() && MinBWs.contains(I) &&
1168 !isProfitableToScalarize(I, VF) &&
1170 }
1171
1172 /// Decision that was taken during cost calculation for memory instruction.
1175 CM_Widen, // For consecutive accesses with stride +1.
1176 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1183
1184 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1185 /// instruction \p I and vector width \p VF.
1188 assert(VF.isVector() && "Expected VF >=2");
1189 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1190 }
1191
1192 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1193 /// interleaving group \p Grp and vector width \p VF.
1197 assert(VF.isVector() && "Expected VF >=2");
1198 /// Broadcast this decision to all instructions inside the group.
1199 /// But the cost will be assigned to one instruction only.
1200 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1201 if (auto *I = Grp->getMember(i)) {
1202 if (Grp->getInsertPos() == I)
1203 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1204 else
1205 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1206 }
1207 }
1208 }
1209
1210 /// Return the cost model decision for the given instruction \p I and vector
1211 /// width \p VF. Return CM_Unknown if this instruction did not pass
1212 /// through the cost modeling.
1214 assert(VF.isVector() && "Expected VF to be a vector VF");
1215 assert(
1216 TheLoop->isInnermost() &&
1217 "cost-model should not be used for outer loops (in VPlan-native path)");
1218
1219 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1220 auto Itr = WideningDecisions.find(InstOnVF);
1221 if (Itr == WideningDecisions.end())
1222 return CM_Unknown;
1223 return Itr->second.first;
1224 }
1225
1226 /// Return the vectorization cost for the given instruction \p I and vector
1227 /// width \p VF.
1229 assert(VF.isVector() && "Expected VF >=2");
1230 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1231 assert(WideningDecisions.contains(InstOnVF) &&
1232 "The cost is not calculated");
1233 return WideningDecisions[InstOnVF].second;
1234 }
1235
1240 std::optional<unsigned> MaskPos;
1242 };
1243
1245 Function *Variant, Intrinsic::ID IID,
1246 std::optional<unsigned> MaskPos,
1248 assert(!VF.isScalar() && "Expected vector VF");
1249 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1250 MaskPos, Cost};
1251 }
1252
1254 ElementCount VF) const {
1255 assert(!VF.isScalar() && "Expected vector VF");
1256 return CallWideningDecisions.at(std::make_pair(CI, VF));
1257 }
1258
1259 /// Return True if instruction \p I is an optimizable truncate whose operand
1260 /// is an induction variable. Such a truncate will be removed by adding a new
1261 /// induction variable with the destination type.
1263 // If the instruction is not a truncate, return false.
1264 auto *Trunc = dyn_cast<TruncInst>(I);
1265 if (!Trunc)
1266 return false;
1267
1268 // Get the source and destination types of the truncate.
1269 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1270 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1271
1272 // If the truncate is free for the given types, return false. Replacing a
1273 // free truncate with an induction variable would add an induction variable
1274 // update instruction to each iteration of the loop. We exclude from this
1275 // check the primary induction variable since it will need an update
1276 // instruction regardless.
1277 Value *Op = Trunc->getOperand(0);
1278 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1279 return false;
1280
1281 // If the truncated value is not an induction variable, return false.
1282 return Legal->isInductionPhi(Op);
1283 }
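 // Example (illustration only): given
 //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]   ; non-primary induction
 //   %t  = trunc i64 %iv to i32
 // the trunc is optimizable when the i64 -> i32 truncate is not free for the
 // target (or %iv is the primary induction): instead of widening the trunc, a
 // new induction variable with the destination type is created.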
1284
1285 /// Collects the instructions to scalarize for each predicated instruction in
1286 /// the loop.
1288
1289 /// Collect Uniform and Scalar values for the given \p VF.
1290 /// The sets depend on CM decision for Load/Store instructions
1291 /// that may be vectorized as interleave, gather-scatter or scalarized.
1292 /// Also make a decision on what to do about call instructions in the loop
1293 /// at that VF -- scalarize, call a known vector routine, or call a
1294 /// vector intrinsic.
1296 // Do the analysis once.
1297 if (VF.isScalar() || Uniforms.contains(VF))
1298 return;
1301 collectLoopUniforms(VF);
1302 collectLoopScalars(VF);
1303 }
1304
1305 /// Returns true if the target machine supports masked store operation
1306 /// for the given \p DataType and kind of access to \p Ptr.
1307 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1308 return Legal->isConsecutivePtr(DataType, Ptr) &&
1309 TTI.isLegalMaskedStore(DataType, Alignment);
1310 }
1311
1312 /// Returns true if the target machine supports masked load operation
1313 /// for the given \p DataType and kind of access to \p Ptr.
1314 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1315 return Legal->isConsecutivePtr(DataType, Ptr) &&
1316 TTI.isLegalMaskedLoad(DataType, Alignment);
1317 }
1318
1319 /// Returns true if the target machine can represent \p V as a masked gather
1320 /// or scatter operation.
1322 bool LI = isa<LoadInst>(V);
1323 bool SI = isa<StoreInst>(V);
1324 if (!LI && !SI)
1325 return false;
1326 auto *Ty = getLoadStoreType(V);
1328 if (VF.isVector())
1329 Ty = VectorType::get(Ty, VF);
1330 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1331 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1332 }
1333
1334 /// Returns true if the target machine supports all of the reduction
1335 /// variables found for the given VF.
1337 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1338 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1339 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1340 }));
1341 }
1342
1343 /// Given costs for both strategies, return true if the scalar predication
1344 /// lowering should be used for div/rem. This incorporates an override
1345 /// option so it is not simply a cost comparison.
1347 InstructionCost SafeDivisorCost) const {
1348 switch (ForceSafeDivisor) {
1349 case cl::BOU_UNSET:
1350 return ScalarCost < SafeDivisorCost;
1351 case cl::BOU_TRUE:
1352 return false;
1353 case cl::BOU_FALSE:
1354 return true;
1355 };
1356 llvm_unreachable("impossible case value");
1357 }
1358
1359 /// Returns true if \p I is an instruction which requires predication and
1360 /// for which our chosen predication strategy is scalarization (i.e. we
1361 /// don't have an alternate strategy such as masking available).
1362 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1364
1365 /// Returns true if \p I is an instruction that needs to be predicated
1366 /// at runtime. The result is independent of the predication mechanism.
1367 /// Superset of instructions that return true for isScalarWithPredication.
1368 bool isPredicatedInst(Instruction *I) const;
1369
1370 /// Return the costs for our two available strategies for lowering a
1371 /// div/rem operation which requires speculating at least one lane.
1372 /// First result is for scalarization (will be invalid for scalable
1373 /// vectors); second is for the safe-divisor strategy.
1374 std::pair<InstructionCost, InstructionCost>
1376 ElementCount VF) const;
1377
1378 /// Returns true if \p I is a memory instruction with consecutive memory
1379 /// access that can be widened.
1381
1382 /// Returns true if \p I is a memory instruction in an interleaved-group
1383 /// of memory accesses that can be vectorized with wide vector loads/stores
1384 /// and shuffles.
1386
1387 /// Check if \p Instr belongs to any interleaved access group.
1389 return InterleaveInfo.isInterleaved(Instr);
1390 }
1391
1392 /// Get the interleaved access group that \p Instr belongs to.
1395 return InterleaveInfo.getInterleaveGroup(Instr);
1396 }
1397
1398 /// Returns true if we're required to use a scalar epilogue for at least
1399 /// the final iteration of the original loop.
1400 bool requiresScalarEpilogue(bool IsVectorizing) const {
1401 if (!isScalarEpilogueAllowed()) {
1402 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1403 return false;
1404 }
1405 // If we might exit from anywhere but the latch, must run the exiting
1406 // iteration in scalar form.
1408 LLVM_DEBUG(
1409 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1410 return true;
1411 }
1412 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1413 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1414 "interleaved group requires scalar epilogue\n");
1415 return true;
1416 }
1417 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1418 return false;
1419 }
1420
1421 /// Returns true if we're required to use a scalar epilogue for at least
1422 /// the final iteration of the original loop for all VFs in \p Range.
1423 /// A scalar epilogue must either be required for all VFs in \p Range or for
1424 /// none.
1426 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1427 return requiresScalarEpilogue(VF.isVector());
1428 };
1429 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1430 assert(
1431 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1432 "all VFs in range must agree on whether a scalar epilogue is required");
1433 return IsRequired;
1434 }
1435
1436 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1437 /// loop hint annotation.
1439 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1440 }
1441
1442 /// Returns the TailFoldingStyle that is best for the current loop.
1443 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1444 if (!ChosenTailFoldingStyle)
1445 return TailFoldingStyle::None;
1446 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1447 : ChosenTailFoldingStyle->second;
1448 }
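 // Usage sketch (illustration only): callers pick the pre-computed style based
 // on whether the IV update may wrap, e.g.
 //   TailFoldingStyle S = getTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
 // The two variants only differ when the target requests a different strategy
 // for the overflowing case.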
1449
1450 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1451 /// overflow or not.
1452 /// \param IsScalableVF true if scalable vector factors enabled.
1453 /// \param UserIC User specific interleave count.
1454 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1455 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1456 if (!Legal->canFoldTailByMasking()) {
1457 ChosenTailFoldingStyle =
1459 return;
1460 }
1461
1462 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1463 ChosenTailFoldingStyle = std::make_pair(
1464 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1465 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1466 return;
1467 }
1468
1469 // Set styles when forced.
1470 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1471 ForceTailFoldingStyle.getValue());
1473 return;
1474 // Override forced styles if needed.
1475 // FIXME: use actual opcode/data type for analysis here.
1476 // FIXME: Investigate opportunity for fixed vector factor.
1477 bool EVLIsLegal =
1478 IsScalableVF && UserIC <= 1 &&
1479 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1481 // FIXME: implement support for max safe dependency distance.
1483 if (!EVLIsLegal) {
1484 // If for some reason EVL mode is unsupported, fallback to
1485 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1486 // in a generic way.
1487 ChosenTailFoldingStyle =
1490 LLVM_DEBUG(
1491 dbgs()
1492 << "LV: Preference for VP intrinsics indicated. Will "
1493 "not try to generate VP Intrinsics "
1494 << (UserIC > 1
1495 ? "since interleave count specified is greater than 1.\n"
1496 : "due to non-interleaving reasons.\n"));
1497 }
1498 }
1499
1500 /// Returns true if all loop blocks should be masked to fold tail loop.
1501 bool foldTailByMasking() const {
1502 // TODO: check if it is possible to check for None style independent of
1503 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1505 }
1506
1507 /// Returns true if the instructions in this block requires predication
1508 /// for any reason, e.g. because tail folding now requires a predicate
1509 /// or because the block in the original loop was predicated.
1512 }
1513
1514 /// Returns true if VP intrinsics with explicit vector length support should
1515 /// be generated in the tail folded loop.
1516 bool foldTailWithEVL() const {
1518 }
1519
1520 /// Returns true if the Phi is part of an inloop reduction.
1521 bool isInLoopReduction(PHINode *Phi) const {
1522 return InLoopReductions.contains(Phi);
1523 }
1524
1525 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1526 /// with factor VF. Return the cost of the instruction, including
1527 /// scalarization overhead if it's needed.
1529
1530 /// Estimate cost of a call instruction CI if it were vectorized with factor
1531 /// VF. Return the cost of the instruction, including scalarization overhead
1532 /// if it's needed.
1534
1535 /// Invalidates decisions already taken by the cost model.
1537 WideningDecisions.clear();
1538 CallWideningDecisions.clear();
1539 Uniforms.clear();
1540 Scalars.clear();
1541 }
1542
1543 /// Returns the expected execution cost. The unit of the cost does
1544 /// not matter because we use the 'cost' units to compare different
1545 /// vector widths. The cost that is returned is *not* normalized by
1546 /// the factor width. If \p Invalid is not nullptr, this function
1547 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1548 /// each instruction that has an Invalid cost for the given VF.
1552
1553 bool hasPredStores() const { return NumPredStores > 0; }
1554
1555 /// Returns true if epilogue vectorization is considered profitable, and
1556 /// false otherwise.
1557 /// \p VF is the vectorization factor chosen for the original loop.
1559
1560 /// Returns the execution time cost of an instruction for a given vector
1561 /// width. Vector width of one means scalar.
1563
1564 /// Return the cost of instructions in an inloop reduction pattern, if I is
1565 /// part of that pattern.
1566 std::optional<InstructionCost>
1569
1570private:
1571 unsigned NumPredStores = 0;
1572
1573 /// \return An upper bound for the vectorization factors for both
1574 /// fixed and scalable vectorization, where the minimum-known number of
1575 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1576 /// disabled or unsupported, then the scalable part will be equal to
1577 /// ElementCount::getScalable(0).
1578 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1579 ElementCount UserVF,
1580 bool FoldTailByMasking);
1581
1582 /// \return the maximized element count based on the target's vector
1583 /// registers and the loop trip-count, but limited to a maximum safe VF.
1584 /// This is a helper function of computeFeasibleMaxVF.
1585 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1586 unsigned SmallestType,
1587 unsigned WidestType,
1588 ElementCount MaxSafeVF,
1589 bool FoldTailByMasking);
1590
1591 /// Checks if scalable vectorization is supported and enabled. Caches the
1592 /// result to avoid repeated debug dumps for repeated queries.
1593 bool isScalableVectorizationAllowed();
1594
1595 /// \return the maximum legal scalable VF, based on the safe max number
1596 /// of elements.
1597 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1598
1599 /// Calculate vectorization cost of memory instruction \p I.
1600 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1601
1602 /// The cost computation for scalarized memory instruction.
1603 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1604
1605 /// The cost computation for interleaving group of memory instructions.
1606 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1607
1608 /// The cost computation for Gather/Scatter instruction.
1609 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1610
1611 /// The cost computation for widening instruction \p I with consecutive
1612 /// memory access.
1613 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1614
1615 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1616 /// Load: scalar load + broadcast.
1617 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1618 /// element)
1619 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1620
1621 /// Estimate the overhead of scalarizing an instruction. This is a
1622 /// convenience wrapper for the type-based getScalarizationOverhead API.
1623 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1625
1626 /// Returns true if an artificially high cost for emulated masked memrefs
1627 /// should be used.
1628 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1629
1630 /// Map of scalar integer values to the smallest bitwidth they can be legally
1631 /// represented as. The vector equivalents of these values should be truncated
1632 /// to this type.
1634
1635 /// A type representing the costs for instructions if they were to be
1636 /// scalarized rather than vectorized. The entries are Instruction-Cost
1637 /// pairs.
1638 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1639
1640 /// A set containing all BasicBlocks that are known to be present after
1641 /// vectorization as a predicated block.
1643 PredicatedBBsAfterVectorization;
1644
1645 /// Records whether it is allowed to have the original scalar loop execute at
1646 /// least once. This may be needed as a fallback loop in case runtime
1647 /// aliasing/dependence checks fail, or to handle the tail/remainder
1648 /// iterations when the trip count is unknown or is not a multiple of the VF,
1649 /// or as a peel-loop to handle gaps in interleave-groups.
1650 /// Under optsize and when the trip count is very small we don't allow any
1651 /// iterations to execute in the scalar loop.
1652 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1653
1654 /// Controls the finally chosen tail folding style. The first element is used
1655 /// if the IV update may overflow; the second element is used if it does not.
1656 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1657 ChosenTailFoldingStyle;
1658
1659 /// true if scalable vectorization is supported and enabled.
1660 std::optional<bool> IsScalableVectorizationAllowed;
1661
1662 /// A map holding scalar costs for different vectorization factors. The
1663 /// presence of a cost for an instruction in the mapping indicates that the
1664 /// instruction will be scalarized when vectorizing with the associated
1665 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1667
1668 /// Holds the instructions known to be uniform after vectorization.
1669 /// The data is collected per VF.
1671
1672 /// Holds the instructions known to be scalar after vectorization.
1673 /// The data is collected per VF.
1675
1676 /// Holds the instructions (address computations) that are forced to be
1677 /// scalarized.
1679
1680 /// PHINodes of the reductions that should be expanded in-loop.
1681 SmallPtrSet<PHINode *, 4> InLoopReductions;
1682
1683 /// A Map of inloop reduction operations and their immediate chain operand.
1684 /// FIXME: This can be removed once reductions can be costed correctly in
1685 /// VPlan. This was added to allow quick lookup of the inloop operations.
1686 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1687
1688 /// Returns the expected difference in cost from scalarizing the expression
1689 /// feeding a predicated instruction \p PredInst. The instructions to
1690 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1691 /// non-negative return value implies the expression will be scalarized.
1692 /// Currently, only single-use chains are considered for scalarization.
1693 InstructionCost computePredInstDiscount(Instruction *PredInst,
1694 ScalarCostsTy &ScalarCosts,
1695 ElementCount VF);
1696
1697 /// Collect the instructions that are uniform after vectorization. An
1698 /// instruction is uniform if we represent it with a single scalar value in
1699 /// the vectorized loop corresponding to each vector iteration. Examples of
1700 /// uniform instructions include pointer operands of consecutive or
1701 /// interleaved memory accesses. Note that although uniformity implies an
1702 /// instruction will be scalar, the reverse is not true. In general, a
1703 /// scalarized instruction will be represented by VF scalar values in the
1704 /// vectorized loop, each corresponding to an iteration of the original
1705 /// scalar loop.
1706 void collectLoopUniforms(ElementCount VF);
1707
1708 /// Collect the instructions that are scalar after vectorization. An
1709 /// instruction is scalar if it is known to be uniform or will be scalarized
1710 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1711 /// to the list if they are used by a load/store instruction that is marked as
1712 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1713 /// VF values in the vectorized loop, each corresponding to an iteration of
1714 /// the original scalar loop.
1715 void collectLoopScalars(ElementCount VF);
1716
1717 /// Keeps the cost-model vectorization decisions and costs for instructions.
1718 /// Right now it is used for memory instructions only.
1719 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1720 std::pair<InstWidening, InstructionCost>>;
1721
1722 DecisionList WideningDecisions;
1723
1724 using CallDecisionList =
1725 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1726
1727 CallDecisionList CallWideningDecisions;
1728
1729 /// Returns true if \p V is expected to be vectorized and it needs to be
1730 /// extracted.
1731 bool needsExtract(Value *V, ElementCount VF) const {
1732 Instruction *I = dyn_cast<Instruction>(V);
1733 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1735 return false;
1736
1737 // Assume we can vectorize V (and hence we need extraction) if the
1738 // scalars are not computed yet. This can happen, because it is called
1739 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1740 // the scalars are collected. That should be a safe assumption in most
1741 // cases, because we check if the operands have vectorizable types
1742 // beforehand in LoopVectorizationLegality.
1743 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1744 };
1745
1746 /// Returns a range containing only operands needing to be extracted.
1747 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1748 ElementCount VF) const {
1749 return SmallVector<Value *, 4>(make_filter_range(
1750 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1751 }
1752
1753public:
1754 /// The loop that we evaluate.
1756
1757 /// Predicated scalar evolution analysis.
1759
1760 /// Loop Info analysis.
1762
1763 /// Vectorization legality.
1765
1766 /// Vector target information.
1768
1769 /// Target Library Info.
1771
1772 /// Demanded bits analysis.
1774
1775 /// Assumption cache.
1777
1778 /// Interface to emit optimization remarks.
1780
1782
1783 /// Loop Vectorize Hint.
1785
1786 /// The interleaved access information contains groups of interleaved accesses
1787 /// with the same stride that are close to each other.
1789
1790 /// Values to ignore in the cost model.
1792
1793 /// Values to ignore in the cost model when VF > 1.
1795
1796 /// All element types found in the loop.
1798};
1799} // end namespace llvm
1800
1801namespace {
1802/// Helper struct to manage generating runtime checks for vectorization.
1803///
1804 /// The runtime checks are created up-front in temporary blocks, un-linked from
1805 /// the existing IR, so that their cost can be estimated accurately. After deciding to
1806/// vectorize, the checks are moved back. If deciding not to vectorize, the
1807/// temporary blocks are completely removed.
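///
/// A rough usage sketch, for illustration only (argument lists abbreviated):
///   GeneratedRTChecks Checks(SE, DT, LI, ...);
///   Checks.Create(L, LAI, UnionPred, VF, IC);  // build check blocks up-front
///   RTCost = Checks.getCost();                 // fold their cost into the decision
///   // Later, emitSCEVChecks()/emitMemRuntimeChecks() re-link the blocks into
///   // the CFG; if vectorization is abandoned, the destructor removes them.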
1808class GeneratedRTChecks {
1809 /// Basic block which contains the generated SCEV checks, if any.
1810 BasicBlock *SCEVCheckBlock = nullptr;
1811
1812 /// The value representing the result of the generated SCEV checks. If it is
1813 /// nullptr, either no SCEV checks have been generated or they have been used.
1814 Value *SCEVCheckCond = nullptr;
1815
1816 /// Basic block which contains the generated memory runtime checks, if any.
1817 BasicBlock *MemCheckBlock = nullptr;
1818
1819 /// The value representing the result of the generated memory runtime checks.
1820 /// If it is nullptr, either no memory runtime checks have been generated or
1821 /// they have been used.
1822 Value *MemRuntimeCheckCond = nullptr;
1823
1824 DominatorTree *DT;
1825 LoopInfo *LI;
1827
1828 SCEVExpander SCEVExp;
1829 SCEVExpander MemCheckExp;
1830
1831 bool CostTooHigh = false;
1832 const bool AddBranchWeights;
1833
1834 Loop *OuterLoop = nullptr;
1835
1836public:
1837 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1839 bool AddBranchWeights)
1840 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1841 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1842
1843 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1844 /// accurately estimate the cost of the runtime checks. The blocks are
1845 /// un-linked from the IR and added back during vector code generation. If
1846 /// there is no vector code generation, the check blocks are removed
1847 /// completely.
1848 void Create(Loop *L, const LoopAccessInfo &LAI,
1849 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1850
1851 // Hard cutoff to limit compile-time increase in case a very large number of
1852 // runtime checks need to be generated.
1853 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1854 // profile info.
1855 CostTooHigh =
1857 if (CostTooHigh)
1858 return;
1859
1860 BasicBlock *LoopHeader = L->getHeader();
1861 BasicBlock *Preheader = L->getLoopPreheader();
1862
1863 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1864 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1865 // may be used by SCEVExpander. The blocks will be un-linked from their
1866 // predecessors and removed from LI & DT at the end of the function.
1867 if (!UnionPred.isAlwaysTrue()) {
1868 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1869 nullptr, "vector.scevcheck");
1870
1871 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1872 &UnionPred, SCEVCheckBlock->getTerminator());
1873 }
1874
1875 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1876 if (RtPtrChecking.Need) {
1877 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1878 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1879 "vector.memcheck");
1880
1881 auto DiffChecks = RtPtrChecking.getDiffChecks();
1882 if (DiffChecks) {
1883 Value *RuntimeVF = nullptr;
1884 MemRuntimeCheckCond = addDiffRuntimeChecks(
1885 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1886 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1887 if (!RuntimeVF)
1888 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1889 return RuntimeVF;
1890 },
1891 IC);
1892 } else {
1893 MemRuntimeCheckCond = addRuntimeChecks(
1894 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1896 }
1897 assert(MemRuntimeCheckCond &&
1898 "no RT checks generated although RtPtrChecking "
1899 "claimed checks are required");
1900 }
1901
1902 if (!MemCheckBlock && !SCEVCheckBlock)
1903 return;
1904
1905 // Unhook the temporary blocks containing the checks and update the various
1906 // places accordingly.
1907 if (SCEVCheckBlock)
1908 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1909 if (MemCheckBlock)
1910 MemCheckBlock->replaceAllUsesWith(Preheader);
1911
1912 if (SCEVCheckBlock) {
1913 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1914 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1915 Preheader->getTerminator()->eraseFromParent();
1916 }
1917 if (MemCheckBlock) {
1918 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1919 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1920 Preheader->getTerminator()->eraseFromParent();
1921 }
1922
1923 DT->changeImmediateDominator(LoopHeader, Preheader);
1924 if (MemCheckBlock) {
1925 DT->eraseNode(MemCheckBlock);
1926 LI->removeBlock(MemCheckBlock);
1927 }
1928 if (SCEVCheckBlock) {
1929 DT->eraseNode(SCEVCheckBlock);
1930 LI->removeBlock(SCEVCheckBlock);
1931 }
1932
1933 // Outer loop is used as part of the later cost calculations.
1934 OuterLoop = L->getParentLoop();
1935 }
1936
1937 InstructionCost getCost() {
1938 if (SCEVCheckBlock || MemCheckBlock)
1939 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1940
1941 if (CostTooHigh) {
1942 InstructionCost Cost;
1943 Cost.setInvalid();
1944 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1945 return Cost;
1946 }
1947
1948 InstructionCost RTCheckCost = 0;
1949 if (SCEVCheckBlock)
1950 for (Instruction &I : *SCEVCheckBlock) {
1951 if (SCEVCheckBlock->getTerminator() == &I)
1952 continue;
1953 InstructionCost C =
1954 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1955 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1956 RTCheckCost += C;
1957 }
1958 if (MemCheckBlock) {
1959 InstructionCost MemCheckCost = 0;
1960 for (Instruction &I : *MemCheckBlock) {
1961 if (MemCheckBlock->getTerminator() == &I)
1962 continue;
1963 InstructionCost C =
1964 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1965 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1966 MemCheckCost += C;
1967 }
1968
1969 // If the runtime memory checks are being created inside an outer loop
1970 // we should find out if these checks are outer loop invariant. If so,
1971 // the checks will likely be hoisted out and so the effective cost will be
1972 // reduced according to the outer loop trip count.
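// For example, with illustrative numbers: if the expanded memory checks cost
// 8 and the outer loop is expected to run 4 times, the amortized cost
// computed below is 8 / 4 = 2, and it is never allowed to drop below 1.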
1973 if (OuterLoop) {
1974 ScalarEvolution *SE = MemCheckExp.getSE();
1975 // TODO: If profitable, we could refine this further by analysing every
1976 // individual memory check, since there could be a mixture of loop
1977 // variant and invariant checks that mean the final condition is
1978 // variant.
1979 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1980 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1981 // It seems reasonable to assume that we can reduce the effective
1982 // cost of the checks even when we know nothing about the trip
1983 // count. Assume that the outer loop executes at least twice.
1984 unsigned BestTripCount = 2;
1985
1986 // If exact trip count is known use that.
1987 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
1988 BestTripCount = SmallTC;
1990 // Else use profile data if available.
1991 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
1992 BestTripCount = *EstimatedTC;
1993 }
1994
1995 BestTripCount = std::max(BestTripCount, 1U);
1996 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1997
1998 // Let's ensure the cost is always at least 1.
1999 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2000 (InstructionCost::CostType)1);
2001
2002 if (BestTripCount > 1)
2004 << "We expect runtime memory checks to be hoisted "
2005 << "out of the outer loop. Cost reduced from "
2006 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2007
2008 MemCheckCost = NewMemCheckCost;
2009 }
2010 }
2011
2012 RTCheckCost += MemCheckCost;
2013 }
2014
2015 if (SCEVCheckBlock || MemCheckBlock)
2016 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2017 << "\n");
2018
2019 return RTCheckCost;
2020 }
2021
2022 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2023 /// unused.
2024 ~GeneratedRTChecks() {
2025 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2026 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2027 if (!SCEVCheckCond)
2028 SCEVCleaner.markResultUsed();
2029
2030 if (!MemRuntimeCheckCond)
2031 MemCheckCleaner.markResultUsed();
2032
2033 if (MemRuntimeCheckCond) {
2034 auto &SE = *MemCheckExp.getSE();
2035 // Memory runtime check generation creates compares that use expanded
2036 // values. Remove them before running the SCEVExpanderCleaners.
2037 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2038 if (MemCheckExp.isInsertedInstruction(&I))
2039 continue;
2040 SE.forgetValue(&I);
2041 I.eraseFromParent();
2042 }
2043 }
2044 MemCheckCleaner.cleanup();
2045 SCEVCleaner.cleanup();
2046
2047 if (SCEVCheckCond)
2048 SCEVCheckBlock->eraseFromParent();
2049 if (MemRuntimeCheckCond)
2050 MemCheckBlock->eraseFromParent();
2051 }
2052
2053 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2054 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2055 /// depending on the generated condition.
2056 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2057 BasicBlock *LoopVectorPreHeader,
2058 BasicBlock *LoopExitBlock) {
2059 if (!SCEVCheckCond)
2060 return nullptr;
2061
2062 Value *Cond = SCEVCheckCond;
2063 // Mark the check as used, to prevent it from being removed during cleanup.
2064 SCEVCheckCond = nullptr;
2065 if (auto *C = dyn_cast<ConstantInt>(Cond))
2066 if (C->isZero())
2067 return nullptr;
2068
2069 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2070
2071 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2072 // Create new preheader for vector loop.
2073 if (OuterLoop)
2074 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2075
2076 SCEVCheckBlock->getTerminator()->eraseFromParent();
2077 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2078 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2079 SCEVCheckBlock);
2080
2081 DT->addNewBlock(SCEVCheckBlock, Pred);
2082 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2083
2084 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2085 if (AddBranchWeights)
2086 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2087 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2088 return SCEVCheckBlock;
2089 }
2090
2091 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2092 /// the branches to branch to the vector preheader or \p Bypass, depending on
2093 /// the generated condition.
2094 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2095 BasicBlock *LoopVectorPreHeader) {
2096 // Check if we generated code that checks at runtime whether the arrays overlap.
2097 if (!MemRuntimeCheckCond)
2098 return nullptr;
2099
2100 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2101 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2102 MemCheckBlock);
2103
2104 DT->addNewBlock(MemCheckBlock, Pred);
2105 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2106 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2107
2108 if (OuterLoop)
2109 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2110
2111 BranchInst &BI =
2112 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2113 if (AddBranchWeights) {
2114 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2115 }
2116 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2117 MemCheckBlock->getTerminator()->setDebugLoc(
2118 Pred->getTerminator()->getDebugLoc());
2119
2120 // Mark the check as used, to prevent it from being removed during cleanup.
2121 MemRuntimeCheckCond = nullptr;
2122 return MemCheckBlock;
2123 }
2124};
2125} // namespace
2126
2128 return Style == TailFoldingStyle::Data ||
2129 Style == TailFoldingStyle::DataAndControlFlow ||
2130 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2131}
2132
2134 return Style == TailFoldingStyle::DataAndControlFlow ||
2135 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2136}
2137
2138// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2139// vectorization. The loop needs to be annotated with #pragma omp simd
2140// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2141// vector length information is not provided, vectorization is not considered
2142// explicit. Interleave hints are not allowed either. These limitations will be
2143// relaxed in the future.
2144 // Please note that we are currently forced to abuse the pragma 'clang
2145// vectorize' semantics. This pragma provides *auto-vectorization hints*
2146// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2147// provides *explicit vectorization hints* (LV can bypass legal checks and
2148// assume that vectorization is legal). However, both hints are implemented
2149// using the same metadata (llvm.loop.vectorize, processed by
2150// LoopVectorizeHints). This will be fixed in the future when the native IR
2151// representation for pragma 'omp simd' is introduced.
2152static bool isExplicitVecOuterLoop(Loop *OuterLp,
2154 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2155 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2156
2157 // Only outer loops with an explicit vectorization hint are supported.
2158 // Unannotated outer loops are ignored.
2160 return false;
2161
2162 Function *Fn = OuterLp->getHeader()->getParent();
2163 if (!Hints.allowVectorization(Fn, OuterLp,
2164 true /*VectorizeOnlyWhenForced*/)) {
2165 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2166 return false;
2167 }
2168
2169 if (Hints.getInterleave() > 1) {
2170 // TODO: Interleave support is future work.
2171 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2172 "outer loops.\n");
2173 Hints.emitRemarkWithHints();
2174 return false;
2175 }
2176
2177 return true;
2178}
2179
2183 // Collect inner loops and outer loops without irreducible control flow. For
2184 // now, only collect outer loops that have explicit vectorization hints. If we
2185 // are stress testing the VPlan H-CFG construction, we collect the outermost
2186 // loop of every loop nest.
2187 if (L.isInnermost() || VPlanBuildStressTest ||
2189 LoopBlocksRPO RPOT(&L);
2190 RPOT.perform(LI);
2191 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192 V.push_back(&L);
2193 // TODO: Collect inner loops inside marked outer loops in case
2194 // vectorization fails for the outer loop. Do not invoke
2195 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2196 // already known to be reducible. We can use an inherited attribute for
2197 // that.
2198 return;
2199 }
2200 }
2201 for (Loop *InnerL : L)
2202 collectSupportedLoops(*InnerL, LI, ORE, V);
2203}
2204
2205//===----------------------------------------------------------------------===//
2206// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2207// LoopVectorizationCostModel and LoopVectorizationPlanner.
2208//===----------------------------------------------------------------------===//
2209
2210/// Compute the transformed value of Index at offset StartValue using step
2211/// StepValue.
2212/// For integer induction, returns StartValue + Index * StepValue.
2213/// For pointer induction, returns StartValue[Index * StepValue].
2214/// FIXME: The newly created binary instructions should contain nsw/nuw
2215/// flags, which can be found from the original scalar operations.
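///
/// For illustration: with an integer induction starting at 5 and stepping by 3,
/// the transformed value at Index = 4 is 5 + 4 * 3 = 17; for a pointer
/// induction with the same index and step it is instead &StartValue[12].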
2216static Value *
2218 Value *Step,
2220 const BinaryOperator *InductionBinOp) {
2221 Type *StepTy = Step->getType();
2222 Value *CastedIndex = StepTy->isIntegerTy()
2223 ? B.CreateSExtOrTrunc(Index, StepTy)
2224 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2225 if (CastedIndex != Index) {
2226 CastedIndex->setName(CastedIndex->getName() + ".cast");
2227 Index = CastedIndex;
2228 }
2229
2230 // Note: the IR at this point is broken. We cannot use SE to create any new
2231 // SCEV and then expand it, hoping that SCEV's simplification will give us
2232 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2233 // lead to various SCEV crashes. So all we can do is to use the builder and rely
2234 // on InstCombine for future simplifications. Here we handle some trivial
2235 // cases only.
2236 auto CreateAdd = [&B](Value *X, Value *Y) {
2237 assert(X->getType() == Y->getType() && "Types don't match!");
2238 if (auto *CX = dyn_cast<ConstantInt>(X))
2239 if (CX->isZero())
2240 return Y;
2241 if (auto *CY = dyn_cast<ConstantInt>(Y))
2242 if (CY->isZero())
2243 return X;
2244 return B.CreateAdd(X, Y);
2245 };
2246
2247 // We allow X to be a vector type, in which case Y will potentially be
2248 // splatted into a vector with the same element count.
2249 auto CreateMul = [&B](Value *X, Value *Y) {
2250 assert(X->getType()->getScalarType() == Y->getType() &&
2251 "Types don't match!");
2252 if (auto *CX = dyn_cast<ConstantInt>(X))
2253 if (CX->isOne())
2254 return Y;
2255 if (auto *CY = dyn_cast<ConstantInt>(Y))
2256 if (CY->isOne())
2257 return X;
2258 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2259 if (XVTy && !isa<VectorType>(Y->getType()))
2260 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2261 return B.CreateMul(X, Y);
2262 };
2263
2264 switch (InductionKind) {
2266 assert(!isa<VectorType>(Index->getType()) &&
2267 "Vector indices not supported for integer inductions yet");
2268 assert(Index->getType() == StartValue->getType() &&
2269 "Index type does not match StartValue type");
2270 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2271 return B.CreateSub(StartValue, Index);
2272 auto *Offset = CreateMul(Index, Step);
2273 return CreateAdd(StartValue, Offset);
2274 }
2276 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2278 assert(!isa<VectorType>(Index->getType()) &&
2279 "Vector indices not supported for FP inductions yet");
2280 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2281 assert(InductionBinOp &&
2282 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2283 InductionBinOp->getOpcode() == Instruction::FSub) &&
2284 "Original bin op should be defined for FP induction");
2285
2286 Value *MulExp = B.CreateFMul(Step, Index);
2287 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2288 "induction");
2289 }
2291 return nullptr;
2292 }
2293 llvm_unreachable("invalid enum");
2294}
2295
2296std::optional<unsigned> getMaxVScale(const Function &F,
2297 const TargetTransformInfo &TTI) {
2298 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2299 return MaxVScale;
2300
2301 if (F.hasFnAttribute(Attribute::VScaleRange))
2302 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2303
2304 return std::nullopt;
2305}
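// For instance (illustrative): a function annotated with vscale_range(1,16) and
// no TTI override yields a maximum vscale of 16, so a scalable VF of
// <vscale x 4 x i32> covers at most 4 * 16 = 64 elements.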
2306
2307/// For the given VF and UF and maximum trip count computed for the loop, return
2308/// whether the induction variable might overflow in the vectorized loop. If not,
2309/// then we know a runtime overflow check always evaluates to false and can be
2310/// removed.
2313 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2314 // Always be conservative if we don't know the exact unroll factor.
2315 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2316
2317 Type *IdxTy = Cost->Legal->getWidestInductionType();
2318 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2319
2320 // We know the runtime overflow check is known false iff the (max) trip-count
2321 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2322 // the vector loop induction variable.
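// Illustrative example: for an i32 induction with a known max trip count of
// 1000, a fixed VF of 4 and MaxUF of 8, (2^32 - 1) - 1000 is far greater than
// 4 * 8 = 32, so the comparison below holds and the runtime overflow check
// can be omitted.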
2323 if (unsigned TC =
2324 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2325 uint64_t MaxVF = VF.getKnownMinValue();
2326 if (VF.isScalable()) {
2327 std::optional<unsigned> MaxVScale =
2328 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2329 if (!MaxVScale)
2330 return false;
2331 MaxVF *= *MaxVScale;
2332 }
2333
2334 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2335 }
2336
2337 return false;
2338}
2339
2340// Return whether we allow using masked interleave-groups (for dealing with
2341// strided loads/stores that reside in predicated blocks, or for dealing
2342// with gaps).
2344 // If an override option has been passed in for interleaved accesses, use it.
2345 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2347
2349}
2350
2352 VPReplicateRecipe *RepRecipe,
2353 const VPIteration &Instance,
2354 VPTransformState &State) {
2355 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2356
2357 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2358 // the first lane and part.
2359 if (isa<NoAliasScopeDeclInst>(Instr))
2360 if (!Instance.isFirstIteration())
2361 return;
2362
2363 // Does this instruction return a value?
2364 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2365
2366 Instruction *Cloned = Instr->clone();
2367 if (!IsVoidRetTy) {
2368 Cloned->setName(Instr->getName() + ".cloned");
2369#if !defined(NDEBUG)
2370 // Verify that VPlan type inference results agree with the type of the
2371 // generated values.
2372 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2373 "inferred type and type from generated instructions do not match");
2374#endif
2375 }
2376
2377 RepRecipe->setFlags(Cloned);
2378
2379 if (auto DL = Instr->getDebugLoc())
2380 State.setDebugLocFrom(DL);
2381
2382 // Replace the operands of the cloned instructions with their scalar
2383 // equivalents in the new loop.
2384 for (const auto &I : enumerate(RepRecipe->operands())) {
2385 auto InputInstance = Instance;
2386 VPValue *Operand = I.value();
2388 InputInstance.Lane = VPLane::getFirstLane();
2389 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2390 }
2391 State.addNewMetadata(Cloned, Instr);
2392
2393 // Place the cloned scalar in the new loop.
2394 State.Builder.Insert(Cloned);
2395
2396 State.set(RepRecipe, Cloned, Instance);
2397
2398 // If we just cloned a new assumption, add it to the assumption cache.
2399 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2400 AC->registerAssumption(II);
2401
2402 // End if-block.
2403 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2404 if (IfPredicateInstr)
2405 PredicatedInstructions.push_back(Cloned);
2406}
2407
2408Value *
2410 if (VectorTripCount)
2411 return VectorTripCount;
2412
2413 Value *TC = getTripCount();
2414 IRBuilder<> Builder(InsertBlock->getTerminator());
2415
2416 Type *Ty = TC->getType();
2417 // This is where we can make the step a runtime constant.
2418 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2419
2420 // If the tail is to be folded by masking, round the number of iterations N
2421 // up to a multiple of Step instead of rounding down. This is done by first
2422 // adding Step-1 and then rounding down. Note that it's ok if this addition
2423 // overflows: the vector induction variable will eventually wrap to zero given
2424 // that it starts at zero and its Step is a power of two; the loop will then
2425 // exit, with the last early-exit vector comparison also producing all-true.
2426 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2427 // is accounted for in emitIterationCountCheck that adds an overflow check.
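// For example, with illustrative numbers: for a trip count of 10 and
// Step = VF * UF = 8, folding the tail rounds N up to 10 + 7 = 17, and the
// URem/Sub below then produce a vector trip count of 16, i.e. two masked
// vector iterations.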
2428 if (Cost->foldTailByMasking()) {
2430 "VF*UF must be a power of 2 when folding tail by masking");
2431 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2432 "n.rnd.up");
2433 }
2434
2435 // Now we need to generate the expression for the part of the loop that the
2436 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2437 // iterations are not required for correctness, or N - Step, otherwise. Step
2438 // is equal to the vectorization factor (number of SIMD elements) times the
2439 // unroll factor (number of SIMD instructions).
2440 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2441
2442 // There are cases where we *must* run at least one iteration in the remainder
2443 // loop. See the cost model for when this can happen. If the step evenly
2444 // divides the trip count, we set the remainder to be equal to the step. If
2445 // the step does not evenly divide the trip count, no adjustment is necessary
2446 // since there will already be scalar iterations. Note that the minimum
2447 // iterations check ensures that N >= Step.
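// Illustrative case: for a trip count of 8 and Step = 4, R would be 0; when a
// scalar epilogue is required, the select below bumps R to 4, so the vector
// loop covers 4 iterations and the remaining 4 run in the scalar epilogue.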
2448 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2449 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2450 R = Builder.CreateSelect(IsZero, Step, R);
2451 }
2452
2453 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2454
2455 return VectorTripCount;
2456}
2457
2459 Value *Count = getTripCount();
2460 // Reuse the existing vector loop preheader for the TC checks.
2461 // Note that a new preheader block is generated for the vector loop.
2462 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2463 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2464
2465 // Generate code to check if the loop's trip count is less than VF * UF, or
2466 // equal to it in case a scalar epilogue is required; this implies that the
2467 // vector trip count is zero. This check also covers the case where adding one
2468 // to the backedge-taken count overflowed leading to an incorrect trip count
2469 // of zero. In this case we will also jump to the scalar loop.
2470 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2472
2473 // If tail is to be folded, vector loop takes care of all iterations.
2474 Type *CountTy = Count->getType();
2475 Value *CheckMinIters = Builder.getFalse();
2476 auto CreateStep = [&]() -> Value * {
2477 // Create step with max(MinProTripCount, UF * VF).
2479 return createStepForVF(Builder, CountTy, VF, UF);
2480
2481 Value *MinProfTC =
2483 if (!VF.isScalable())
2484 return MinProfTC;
2486 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2487 };
2488
2489 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2490 if (Style == TailFoldingStyle::None)
2491 CheckMinIters =
2492 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2493 else if (VF.isScalable() &&
2496 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2497 // an overflow to zero when updating induction variables and so an
2498 // additional overflow check is required before entering the vector loop.
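// For instance (illustrative): if the trip count type could hold at most 255,
// the trip count were 250 and VF * UF could be as large as 8, then
// 255 - 250 = 5 < 8 and the bypass branch is taken to avoid wrapping.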
2499
2500 // Get the maximum unsigned value for the type.
2501 Value *MaxUIntTripCount =
2502 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2503 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2504
2505 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2506 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2507 }
2508
2509 // Create new preheader for vector loop.
2510 LoopVectorPreHeader =
2511 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2512 "vector.ph");
2513
2514 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2515 DT->getNode(Bypass)->getIDom()) &&
2516 "TC check is expected to dominate Bypass");
2517
2518 // Update dominator for Bypass & LoopExit (if needed).
2519 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2520 BranchInst &BI =
2521 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2523 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2524 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2525 LoopBypassBlocks.push_back(TCCheckBlock);
2526}
2527
2529 BasicBlock *const SCEVCheckBlock =
2530 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2531 if (!SCEVCheckBlock)
2532 return nullptr;
2533
2534 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2536 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2537 "Cannot SCEV check stride or overflow when optimizing for size");
2538
2539
2540 // Update dominator only if this is first RT check.
2541 if (LoopBypassBlocks.empty()) {
2542 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2543 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2544 // If there is an epilogue which must run, there's no edge from the
2545 // middle block to exit blocks and thus no need to update the immediate
2546 // dominator of the exit blocks.
2547 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2548 }
2549
2550 LoopBypassBlocks.push_back(SCEVCheckBlock);
2551 AddedSafetyChecks = true;
2552 return SCEVCheckBlock;
2553}
2554
2556 // VPlan-native path does not do any analysis for runtime checks currently.
2558 return nullptr;
2559
2560 BasicBlock *const MemCheckBlock =
2561 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2562
2563 // Check if we generated code that checks at runtime whether the arrays overlap. We put
2564 // the checks into a separate block to make the more common case of few
2565 // elements faster.
2566 if (!MemCheckBlock)
2567 return nullptr;
2568
2569 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2570 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2571 "Cannot emit memory checks when optimizing for size, unless forced "
2572 "to vectorize.");
2573 ORE->emit([&]() {
2574 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2577 << "Code-size may be reduced by not forcing "
2578 "vectorization, or by source-code modifications "
2579 "eliminating the need for runtime checks "
2580 "(e.g., adding 'restrict').";
2581 });
2582 }
2583
2584 LoopBypassBlocks.push_back(MemCheckBlock);
2585
2586 AddedSafetyChecks = true;
2587
2588 return MemCheckBlock;
2589}
2590
2594 assert(LoopVectorPreHeader && "Invalid loop structure");
2595 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2596 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2597 "multiple exit loop without required epilogue?");
2598
2601 LI, nullptr, Twine(Prefix) + "middle.block");
2604 nullptr, Twine(Prefix) + "scalar.ph");
2605}
2606
2608 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2609 ArrayRef<BasicBlock *> BypassBlocks,
2610 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2612 assert(VectorTripCount && "Expected valid arguments");
2613
2614 Instruction *OldInduction = Legal->getPrimaryInduction();
2615 Value *&EndValue = IVEndValues[OrigPhi];
2616 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2617 if (OrigPhi == OldInduction) {
2618 // We know what the end value is.
2619 EndValue = VectorTripCount;
2620 } else {
2622
2623 // Fast-math-flags propagate from the original induction instruction.
2624 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2625 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2626
2627 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2628 Step, II.getKind(), II.getInductionBinOp());
2629 EndValue->setName("ind.end");
2630
2631 // Compute the end value for the additional bypass (if applicable).
2632 if (AdditionalBypass.first) {
2633 B.SetInsertPoint(AdditionalBypass.first,
2634 AdditionalBypass.first->getFirstInsertionPt());
2635 EndValueFromAdditionalBypass =
2636 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
2637 Step, II.getKind(), II.getInductionBinOp());
2638 EndValueFromAdditionalBypass->setName("ind.end");
2639 }
2640 }
2641
2642 // Create phi nodes to merge from the backedge-taken check block.
2643 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
2645 // Copy original phi DL over to the new one.
2646 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2647
2648 // The new PHI merges the original incoming value, in case of a bypass,
2649 // or the value at the end of the vectorized loop.
2650 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
2651
2652 // Fix the scalar body counter (PHI node).
2653 // The old induction's phi node in the scalar body needs the truncated
2654 // value.
2655 for (BasicBlock *BB : BypassBlocks)
2656 BCResumeVal->addIncoming(II.getStartValue(), BB);
2657
2658 if (AdditionalBypass.first)
2659 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
2660 EndValueFromAdditionalBypass);
2661 return BCResumeVal;
2662}
2663
2664/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2665/// expansion results.
2667 const SCEV2ValueTy &ExpandedSCEVs) {
2668 const SCEV *Step = ID.getStep();
2669 if (auto *C = dyn_cast<SCEVConstant>(Step))
2670 return C->getValue();
2671 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2672 return U->getValue();
2673 auto I = ExpandedSCEVs.find(Step);
2674 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2675 return I->second;
2676}
2677
2679 const SCEV2ValueTy &ExpandedSCEVs,
2680 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2681 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
2682 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
2683 "Inconsistent information about additional bypass.");
2684 // We are going to resume the execution of the scalar loop.
2685 // Go over all of the induction variables that we found and fix the
2686 // PHIs that are left in the scalar version of the loop.
2687 // The starting values of PHI nodes depend on the counter of the last
2688 // iteration in the vectorized loop.
2689 // If we come from a bypass edge then we need to start from the original
2690 // start value.
2691 for (const auto &InductionEntry : Legal->getInductionVars()) {
2692 PHINode *OrigPhi = InductionEntry.first;
2693 const InductionDescriptor &II = InductionEntry.second;
2694 PHINode *BCResumeVal = createInductionResumeValue(
2695 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
2696 AdditionalBypass);
2697 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
2698 }
2699}
2700
2701std::pair<BasicBlock *, Value *>
2703 const SCEV2ValueTy &ExpandedSCEVs) {
2704 /*
2705 In this function we generate a new loop. The new loop will contain
2706 the vectorized instructions while the old loop will continue to run the
2707 scalar remainder.
2708
2709 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2710 / | preheader are expanded here. Eventually all required SCEV
2711 / | expansion should happen here.
2712 / v
2713 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2714 | / |
2715 | / v
2716 || [ ] <-- vector pre header.
2717 |/ |
2718 | v
2719 | [ ] \
2720 | [ ]_| <-- vector loop (created during VPlan execution).
2721 | |
2722 | v
2723 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2724 | | successors created during VPlan execution)
2725 \/ |
2726 /\ v
2727 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2728 | |
2729 (opt) v <-- edge from middle to exit iff epilogue is not required.
2730 | [ ] \
2731 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
2732 \ |
2733 \ v
2734 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2735 ...
2736 */
2737
2738 // Create an empty vector loop, and prepare basic blocks for the runtime
2739 // checks.
2740 createVectorLoopSkeleton("");
2741
2742 // Now, compare the new count to zero. If it is zero skip the vector loop and
2743 // jump to the scalar loop. This check also covers the case where the
2744 // backedge-taken count is uint##_max: adding one to it will overflow leading
2745 // to an incorrect trip count of zero. In this (rare) case we will also jump
2746 // to the scalar loop.
2747 emitIterationCountCheck(LoopScalarPreHeader);
2748
2749 // Generate the code to check any assumptions that we've made for SCEV
2750 // expressions.
2751 emitSCEVChecks(LoopScalarPreHeader);
2752
2753 // Generate the code that checks at runtime whether the arrays overlap. We put the
2754 // checks into a separate block to make the more common case of few elements
2755 // faster.
2756 emitMemRuntimeChecks(LoopScalarPreHeader);
2757
2758 // Emit phis for the new starting index of the scalar loop.
2759 createInductionResumeValues(ExpandedSCEVs);
2760
2761 return {LoopVectorPreHeader, nullptr};
2762}
2763
2764// Fix up external users of the induction variable. At this point, we are
2765// in LCSSA form, with all external PHIs that use the IV having one input value,
2766// coming from the remainder loop. We need those PHIs to also have a correct
2767// value for the IV when arriving directly from the middle block.
2769 const InductionDescriptor &II,
2770 Value *VectorTripCount, Value *EndValue,
2771 BasicBlock *MiddleBlock,
2772 BasicBlock *VectorHeader, VPlan &Plan,
2773 VPTransformState &State) {
2774 // There are two kinds of external IV usages - those that use the value
2775 // computed in the last iteration (the PHI) and those that use the penultimate
2776 // value (the value that feeds into the phi from the loop latch).
2777 // We allow both, but they, obviously, have different values.
2778
2779 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
2780
2781 DenseMap<Value *, Value *> MissingVals;
2782
2783 // An external user of the last iteration's value should see the value that
2784 // the remainder loop uses to initialize its own IV.
2786 for (User *U : PostInc->users()) {
2787 Instruction *UI = cast<Instruction>(U);
2788 if (!OrigLoop->contains(UI)) {
2789 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2790 MissingVals[UI] = EndValue;
2791 }
2792 }
2793
2794 // An external user of the penultimate value needs to see EndValue - Step.
2795 // The simplest way to get this is to recompute it from the constituent SCEVs,
2796 // that is Start + (Step * (CRD - 1)).
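// For example (illustrative): with Start = 0, Step = 2 and a vector trip
// count of 8, the last value is 0 + 2 * 8 = 16, while a user of the
// penultimate value must see 0 + 2 * (8 - 1) = 14, i.e. EndValue - Step.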
2797 for (User *U : OrigPhi->users()) {
2798 auto *UI = cast<Instruction>(U);
2799 if (!OrigLoop->contains(UI)) {
2800 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2801 IRBuilder<> B(MiddleBlock->getTerminator());
2802
2803 // Fast-math-flags propagate from the original induction instruction.
2804 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2805 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2806
2807 Value *CountMinusOne = B.CreateSub(
2808 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
2809 CountMinusOne->setName("cmo");
2810
2811 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2812 assert(StepVPV && "step must have been expanded during VPlan execution");
2813 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2814 : State.get(StepVPV, {0, 0});
2815 Value *Escape =
2816 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
2817 II.getKind(), II.getInductionBinOp());
2818 Escape->setName("ind.escape");
2819 MissingVals[UI] = Escape;
2820 }
2821 }
2822
2823 for (auto &I : MissingVals) {
2824 PHINode *PHI = cast<PHINode>(I.first);
2825 // One corner case we have to handle is two IVs "chasing" each other,
2826 // that is %IV2 = phi [...], [ %IV1, %latch ]
2827 // In this case, if IV1 has an external use, we need to avoid adding both
2828 // "last value of IV1" and "penultimate value of IV2". So, verify that we
2829 // don't already have an incoming value for the middle block.
2830 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
2831 PHI->addIncoming(I.second, MiddleBlock);
2832 Plan.removeLiveOut(PHI);
2833 }
2834 }
2835}
2836
2837namespace {
2838
2839struct CSEDenseMapInfo {
2840 static bool canHandle(const Instruction *I) {
2841 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2842 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2843 }
2844
2845 static inline Instruction *getEmptyKey() {
2846 return DenseMapInfo<Instruction *>::getEmptyKey();
2847 }
2848
2849 static inline Instruction *getTombstoneKey() {
2850 return DenseMapInfo<Instruction *>::getTombstoneKey();
2851 }
2852
2853 static unsigned getHashValue(const Instruction *I) {
2854 assert(canHandle(I) && "Unknown instruction!");
2855 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2856 I->value_op_end()));
2857 }
2858
2859 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2860 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2861 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2862 return LHS == RHS;
2863 return LHS->isIdenticalTo(RHS);
2864 }
2865};
2866
2867} // end anonymous namespace
2868
2869 /// Perform CSE of induction variable instructions.
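///
/// For example (illustrative): two identical 'getelementptr' instructions
/// produced while widening inductions collapse into one; the second is
/// replaced by the first and erased.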
2870static void cse(BasicBlock *BB) {
2871 // Perform simple CSE.
2872 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2873 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2874 if (!CSEDenseMapInfo::canHandle(&In))
2875 continue;
2876
2877 // Check if we can replace this instruction with any of the
2878 // visited instructions.
2879 if (Instruction *V = CSEMap.lookup(&In)) {
2880 In.replaceAllUsesWith(V);
2881 In.eraseFromParent();
2882 continue;
2883 }
2884
2885 CSEMap[&In] = &In;
2886 }
2887}
2888
2891 ElementCount VF) const {
2892 // We only need to calculate a cost if the VF is scalar; for actual vectors
2893 // we should already have a pre-calculated cost at each VF.
2894 if (!VF.isScalar())
2895 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2896
2898 Type *RetTy = CI->getType();
2900 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2901 return *RedCost;
2902
2904 for (auto &ArgOp : CI->args())
2905 Tys.push_back(ArgOp->getType());
2906
2907 InstructionCost ScalarCallCost =
2909
2910 // If this is an intrinsic we may have a lower cost for it.
2912 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2913 return std::min(ScalarCallCost, IntrinsicCost);
2914 }
2915 return ScalarCallCost;
2916}
2917
2919 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2920 return Elt;
2921 return VectorType::get(Elt, VF);
2922}
2923
2926 ElementCount VF) const {
2928 assert(ID && "Expected intrinsic call!");
2929 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
2930 FastMathFlags FMF;
2931 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2932 FMF = FPMO->getFastMathFlags();
2933
2936 SmallVector<Type *> ParamTys;
2937 std::transform(FTy->param_begin(), FTy->param_end(),
2938 std::back_inserter(ParamTys),
2939 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
2940
2941 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2942 dyn_cast<IntrinsicInst>(CI));
2943 return TTI.getIntrinsicInstrCost(CostAttrs,
2945}
2946
2948 VPlan &Plan) {
2949 // Fix widened non-induction PHIs by setting up the PHI operands.
2951 fixNonInductionPHIs(Plan, State);
2952
2953 // Forget the original basic block.
2956
2957 // After vectorization, the exit blocks of the original loop will have
2958 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2959 // looked through single-entry phis.
2960 SmallVector<BasicBlock *> ExitBlocks;
2961 OrigLoop->getExitBlocks(ExitBlocks);
2962 for (BasicBlock *Exit : ExitBlocks)
2963 for (PHINode &PN : Exit->phis())
2965
2966 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2967 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
2968 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
2969 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2970 // No edge from the middle block to the unique exit block has been inserted
2971 // and there is nothing to fix from the vector loop; phis should have incoming
2972 // values from the scalar loop only.
2973 } else {
2974 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2975 // the cost model.
2976
2977 // If we inserted an edge from the middle block to the unique exit block,
2978 // update uses outside the loop (phis) to account for the newly inserted
2979 // edge.
2980
2981 // Fix-up external users of the induction variables.
2982 for (const auto &Entry : Legal->getInductionVars())
2983 fixupIVUsers(Entry.first, Entry.second,
2985 IVEndValues[Entry.first], LoopMiddleBlock,
2986 VectorLoop->getHeader(), Plan, State);
2987 }
2988
2989 // Fix live-out phis not already fixed earlier.
2990 for (const auto &KV : Plan.getLiveOuts())
2991 KV.second->fixPhi(Plan, State);
2992
2993 for (Instruction *PI : PredicatedInstructions)
2994 sinkScalarOperands(&*PI);
2995
2996 // Remove redundant induction instructions.
2997 cse(VectorLoop->getHeader());
2998
2999 // Set/update profile weights for the vector and remainder loops as original
3000 // loop iterations are now distributed among them. Note that the original loop,
3001 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
3002 //
3003 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3004 // end up with a slightly roughened result, but that should be OK since the
3005 // profile is not inherently precise anyway. Note also that a possible bypass of
3006 // the vector code caused by legality checks is ignored, optimistically assigning
3007 // all the weight to the vector loop.
3008 //
3009 // For scalable vectorization we can't know at compile time how many iterations
3010 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3011 // vscale of '1'.
3014 VF.getKnownMinValue() * UF);
3015}
3016
3018 // The basic block and loop containing the predicated instruction.
3019 auto *PredBB = PredInst->getParent();
3020 auto *VectorLoop = LI->getLoopFor(PredBB);
3021
3022 // Initialize a worklist with the operands of the predicated instruction.
3023 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3024
3025 // Holds instructions that we need to analyze again. An instruction may be
3026 // reanalyzed if we don't yet know if we can sink it or not.
3027 SmallVector<Instruction *, 8> InstsToReanalyze;
3028
3029 // Returns true if a given use occurs in the predicated block. Phi nodes use
3030 // their operands in their corresponding predecessor blocks.
3031 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3032 auto *I = cast<Instruction>(U.getUser());
3033 BasicBlock *BB = I->getParent();
3034 if (auto *Phi = dyn_cast<PHINode>(I))
3035 BB = Phi->getIncomingBlock(
3036 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3037 return BB == PredBB;
3038 };
3039
3040 // Iteratively sink the scalarized operands of the predicated instruction
3041 // into the block we created for it. When an instruction is sunk, its
3042 // operands are then added to the worklist. The algorithm ends when one pass
3043 // through the worklist fails to sink a single instruction.
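// Sketch of the idea (illustrative): suppose the predicated block contains
//   %t = add i32 %a, %b
// and %a = mul i32 %x, %y still sits outside it, with %t as its only user.
// The first pass moves %a into the block and queues %x and %y; later passes
// keep going until a full pass sinks nothing.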
3044 bool Changed;
3045 do {
3046 // Add the instructions that need to be reanalyzed to the worklist, and
3047 // reset the changed indicator.
3048 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3049 InstsToReanalyze.clear();
3050 Changed = false;
3051
3052 while (!Worklist.empty()) {
3053 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3054
3055 // We can't sink an instruction if it is a phi node, is not in the loop,
3056 // may have side effects or may read from memory.
3057 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3058 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3059 I->mayHaveSideEffects() || I->mayReadFromMemory())
3060 continue;
3061
3062 // If the instruction is already in PredBB, check if we can sink its
3063 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3064 // sinking the scalar instruction I, hence it appears in PredBB; but it
3065 // may have failed to sink I's operands (recursively), which we try
3066 // (again) here.
3067 if (I->getParent() == PredBB) {
3068 Worklist.insert(I->op_begin(), I->op_end());
3069 continue;
3070 }
3071
3072 // It's legal to sink the instruction if all its uses occur in the
3073 // predicated block. Otherwise, there's nothing to do yet, and we may
3074 // need to reanalyze the instruction.
3075 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3076 InstsToReanalyze.push_back(I);
3077 continue;
3078 }
3079
3080 // Move the instruction to the beginning of the predicated block, and add
3081 // its operands to the worklist.
3082 I->moveBefore(&*PredBB->getFirstInsertionPt());
3083 Worklist.insert(I->op_begin(), I->op_end());
3084
3085 // The sinking may have enabled other instructions to be sunk, so we will
3086 // need to iterate.
3087 Changed = true;
3088 }
3089 } while (Changed);
3090}
3091
3093 VPTransformState &State) {
3094 auto Iter = vp_depth_first_deep(Plan.getEntry());
3095 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3096 for (VPRecipeBase &P : VPBB->phis()) {
3097 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3098 if (!VPPhi)
3099 continue;
3100 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3101 // Make sure the builder has a valid insert point.
3102 Builder.SetInsertPoint(NewPhi);
3103 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3104 VPValue *Inc = VPPhi->getIncomingValue(i);
3105 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3106 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3107 }
3108 }
3109 }
3110}
3111
3112void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3113 // We should not collect Scalars more than once per VF. Right now, this
3114 // function is called from collectUniformsAndScalars(), which already does
3115 // this check. Collecting Scalars for VF=1 does not make any sense.
3116 assert(VF.isVector() && !Scalars.contains(VF) &&
3117 "This function should not be visited twice for the same VF");
3118
3119 // This avoids any chances of creating a REPLICATE recipe during planning
3120 // since that would result in generation of scalarized code during execution,
3121 // which is not supported for scalable vectors.
3122 if (VF.isScalable()) {
3123 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3124 return;
3125 }
3126
3128
3129 // These sets are used to seed the analysis with pointers used by memory
3130 // accesses that will remain scalar.
3132 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3133 auto *Latch = TheLoop->getLoopLatch();
3134
3135 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3136 // The pointer operands of loads and stores will be scalar as long as the
3137 // memory access is not a gather or scatter operation. The value operand of a
3138 // store will remain scalar if the store is scalarized.
3139 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3140 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3141 assert(WideningDecision != CM_Unknown &&
3142 "Widening decision should be ready at this moment");
3143 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3144 if (Ptr == Store->getValueOperand())
3145 return WideningDecision == CM_Scalarize;
3146 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3147 "Ptr is neither a value nor a pointer operand");
3148 return WideningDecision != CM_GatherScatter;
3149 };
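// Example (editorial): for a hypothetical store that was decided to be
// scalarized, such as
//   store i32 %v, ptr %gep
// the helper above treats %gep as a scalar use unless the store was mapped
// to a scatter (CM_GatherScatter), and treats the value operand %v as a
// scalar use only if the store's decision is CM_Scalarize.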
3150
3151 // A helper that returns true if the given value is a bitcast or
3152 // getelementptr instruction contained in the loop.
3153 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3154 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3155 isa<GetElementPtrInst>(V)) &&
3156 !TheLoop->isLoopInvariant(V);
3157 };
3158
3159 // A helper that evaluates a memory access's use of a pointer. If the use will
3160 // be a scalar use and the pointer is only used by memory accesses, we place
3161 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3162 // PossibleNonScalarPtrs.
3163 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3164 // We only care about bitcast and getelementptr instructions contained in
3165 // the loop.
3166 if (!isLoopVaryingBitCastOrGEP(Ptr))
3167 return;
3168
3169 // If the pointer has already been identified as scalar (e.g., if it was
3170 // also identified as uniform), there's nothing to do.
3171 auto *I = cast<Instruction>(Ptr);
3172 if (Worklist.count(I))
3173 return;
3174
3175 // If the use of the pointer will be a scalar use, and all users of the
3176 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3177 // place the pointer in PossibleNonScalarPtrs.
3178 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3179 return isa<LoadInst>(U) || isa<StoreInst>(U);
3180 }))
3181 ScalarPtrs.insert(I);
3182 else
3183 PossibleNonScalarPtrs.insert(I);
3184 };
3185
3186 // We seed the scalars analysis with two classes of instructions: (1)
3187 // instructions marked uniform-after-vectorization and (2) bitcast,
3188 // getelementptr and (pointer) phi instructions used by memory accesses
3189 // requiring a scalar use.
3190 //
3191 // (1) Add to the worklist all instructions that have been identified as
3192 // uniform-after-vectorization.
3193 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3194
3195 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3196 // memory accesses requiring a scalar use. The pointer operands of loads and
3197 // stores will be scalar as long as the memory access is not a gather or
3198 // scatter operation. The value operand of a store will remain scalar if the
3199 // store is scalarized.
3200 for (auto *BB : TheLoop->blocks())
3201 for (auto &I : *BB) {
3202 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3203 evaluatePtrUse(Load, Load->getPointerOperand());
3204 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3205 evaluatePtrUse(Store, Store->getPointerOperand());
3206 evaluatePtrUse(Store, Store->getValueOperand());
3207 }
3208 }
3209 for (auto *I : ScalarPtrs)
3210 if (!PossibleNonScalarPtrs.count(I)) {
3211 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3212 Worklist.insert(I);
3213 }
3214
3215 // Insert the forced scalars.
3216 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3217 // induction variable when the PHI user is scalarized.
3218 auto ForcedScalar = ForcedScalars.find(VF);
3219 if (ForcedScalar != ForcedScalars.end())
3220 for (auto *I : ForcedScalar->second) {
3221 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3222 Worklist.insert(I);
3223 }
3224
3225 // Expand the worklist by looking through any bitcasts and getelementptr
3226 // instructions we've already identified as scalar. This is similar to the
3227 // expansion step in collectLoopUniforms(); however, here we're only
3228 // expanding to include additional bitcasts and getelementptr instructions.
3229 unsigned Idx = 0;
3230 while (Idx != Worklist.size()) {
3231 Instruction *Dst = Worklist[Idx++];
3232 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3233 continue;
3234 auto *Src = cast<Instruction>(Dst->getOperand(0));
3235 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3236 auto *J = cast<Instruction>(U);
3237 return !TheLoop->contains(J) || Worklist.count(J) ||
3238 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3239 isScalarUse(J, Src));
3240 })) {
3241 Worklist.insert(Src);
3242 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3243 }
3244 }
3245
3246 // An induction variable will remain scalar if all users of the induction
3247 // variable and induction variable update remain scalar.
3248 for (const auto &Induction : Legal->getInductionVars()) {
3249 auto *Ind = Induction.first;
3250 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3251
3252 // If tail-folding is applied, the primary induction variable will be used
3253 // to feed a vector compare.
3254 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3255 continue;
3256
3257 // Returns true if \p Indvar is a pointer induction that is used directly by
3258 // load/store instruction \p I.
3259 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3260 Instruction *I) {
3261 return Induction.second.getKind() ==
3262 InductionDescriptor::IK_PtrInduction &&
3263 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3264 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3265 };
3266
3267 // Determine if all users of the induction variable are scalar after
3268 // vectorization.
3269 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3270 auto *I = cast<Instruction>(U);
3271 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3272 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3273 });
3274 if (!ScalarInd)
3275 continue;
3276
3277 // If the induction variable update is a fixed-order recurrence, neither the
3278 // induction variable nor its update should be marked scalar after
3279 // vectorization.
3280 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3281 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3282 continue;
3283
3284 // Determine if all users of the induction variable update instruction are
3285 // scalar after vectorization.
3286 auto ScalarIndUpdate =
3287 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3288 auto *I = cast<Instruction>(U);
3289 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3290 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3291 });
3292 if (!ScalarIndUpdate)
3293 continue;
3294
3295 // The induction variable and its update instruction will remain scalar.
3296 Worklist.insert(Ind);
3297 Worklist.insert(IndUpdate);
3298 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3299 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3300 << "\n");
3301 }
3302
3303 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3304}
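// Example (editorial sketch): in a loop such as
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
// the induction i and its update feed only the address computations of
// consecutive, widenable accesses, so with the logic above both (and the
// GEPs they feed) are expected to be kept scalar, while the loaded and
// stored values themselves are widened.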
3305
3306bool LoopVectorizationCostModel::isScalarWithPredication(
3307 Instruction *I, ElementCount VF) const {
3308 if (!isPredicatedInst(I))
3309 return false;
3310
3311 // Do we have a non-scalar lowering for this predicated
3312 // instruction? No - it is scalar with predication.
3313 switch(I->getOpcode()) {
3314 default:
3315 return true;
3316 case Instruction::Call:
3317 if (VF.isScalar())
3318 return true;
3319 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3320 .Kind == CM_Scalarize;
3321 case Instruction::Load:
3322 case Instruction::Store: {
3323 auto *Ptr = getLoadStorePointerOperand(I);
3324 auto *Ty = getLoadStoreType(I);
3325 Type *VTy = Ty;
3326 if (VF.isVector())
3327 VTy = VectorType::get(Ty, VF);
3328 const Align Alignment = getLoadStoreAlignment(I);
3329 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3330 TTI.isLegalMaskedGather(VTy, Alignment))
3331 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3332 TTI.isLegalMaskedScatter(VTy, Alignment));
3333 }
3334 case Instruction::UDiv:
3335 case Instruction::SDiv:
3336 case Instruction::SRem:
3337 case Instruction::URem: {
3338 // We have the option to use the safe-divisor idiom to avoid predication.
3339 // The cost based decision here will always select safe-divisor for
3340 // scalable vectors as scalarization isn't legal.
3341 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3342 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3343 }
3344 }
3345}
3346
3347bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3348 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3349 return false;
3350
3351 // Can we prove this instruction is safe to unconditionally execute?
3352 // If not, we must use some form of predication.
3353 switch(I->getOpcode()) {
3354 default:
3355 return false;
3356 case Instruction::Load:
3357 case Instruction::Store: {
3358 if (!Legal->isMaskRequired(I))
3359 return false;
3360 // When we know the load's address is loop invariant and the instruction
3361 // in the original scalar loop was unconditionally executed then we
3362 // don't need to mark it as a predicated instruction. Tail folding may
3363 // introduce additional predication, but we're guaranteed to always have
3364 // at least one active lane. We call Legal->blockNeedsPredication here
3365 // because it doesn't query tail-folding. For stores, we need to prove
3366 // both speculation safety (which follows from the same argument as loads)
3367 // and that the value being stored is correct. The easiest form of the
3368 // latter is to require that all values stored are the same.
3369 if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3370 (isa<LoadInst>(I) ||
3371 (isa<StoreInst>(I) &&
3372 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3373 !Legal->blockNeedsPredication(I->getParent()))
3374 return false;
3375 return true;
3376 }
3377 case Instruction::UDiv:
3378 case Instruction::SDiv:
3379 case Instruction::SRem:
3380 case Instruction::URem:
3381 // TODO: We can use the loop-preheader as context point here and get
3382 // context-sensitive reasoning.
3383 return !isSafeToSpeculativelyExecute(I);
3384 case Instruction::Call:
3385 return Legal->isMaskRequired(I);
3386 }
3387}
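// Example (editorial): under tail folding, a store that was unconditional in
// the scalar loop and stores a loop-invariant value, e.g.
//   for (i = 0; i < n; ++i)
//     *p = 42;   // p loop-invariant, executed on every iteration
// is not treated as predicated here: at least one lane is always active, so
// executing it unmasked still stores the same, correct value.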
3388
3389std::pair<InstructionCost, InstructionCost>
3390LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3391 ElementCount VF) const {
3392 assert(I->getOpcode() == Instruction::UDiv ||
3393 I->getOpcode() == Instruction::SDiv ||
3394 I->getOpcode() == Instruction::SRem ||
3395 I->getOpcode() == Instruction::URem);
3397
3399
3400 // Scalarization isn't legal for scalable vector types
3401 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3402 if (!VF.isScalable()) {
3403 // Get the scalarization cost and scale this amount by the probability of
3404 // executing the predicated block. If the instruction is not predicated,
3405 // we fall through to the next case.
3406 ScalarizationCost = 0;
3407
3408 // These instructions have a non-void type, so account for the phi nodes
3409 // that we will create. This cost is likely to be zero. The phi node
3410 // cost, if any, should be scaled by the block probability because it
3411 // models a copy at the end of each predicated block.
3412 ScalarizationCost += VF.getKnownMinValue() *
3413 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3414
3415 // The cost of the non-predicated instruction.
3416 ScalarizationCost += VF.getKnownMinValue() *
3417 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3418
3419 // The cost of insertelement and extractelement instructions needed for
3420 // scalarization.
3421 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3422
3423 // Scale the cost by the probability of executing the predicated blocks.
3424 // This assumes the predicated block for each vector lane is equally
3425 // likely.
3426 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3427 }
3428 InstructionCost SafeDivisorCost = 0;
3429
3430 auto *VecTy = ToVectorTy(I->getType(), VF);
3431
3432 // The cost of the select guard to ensure all lanes are well defined
3433 // after we speculate above any internal control flow.
3434 SafeDivisorCost += TTI.getCmpSelInstrCost(
3435 Instruction::Select, VecTy,
3436 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3437 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3438
3439 // Certain instructions can be cheaper to vectorize if they have a constant
3440 // second vector operand. One example of this is shifts on x86.
3441 Value *Op2 = I->getOperand(1);
3442 auto Op2Info = TTI.getOperandInfo(Op2);
3443 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3444 Legal->isInvariant(Op2))
3445 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3446
3447 SmallVector<const Value *, 4> Operands(I->operand_values());
3448 SafeDivisorCost += TTI.getArithmeticInstrCost(
3449 I->getOpcode(), VecTy, CostKind,
3450 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3451 Op2Info, Operands, I);
3452 return {ScalarizationCost, SafeDivisorCost};
3453}
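// Worked example (editorial; all costs are assumptions, not real TTI
// values): for VF=4, a predicated-block probability of 50%
// (getReciprocalPredBlockProb() == 2), scalar udiv = 4, PHI = 0, a total
// insert/extract overhead of 8, vector select = 1 and vector udiv = 12:
//   ScalarizationCost = (4*0 + 4*4 + 8) / 2 = 12
//   SafeDivisorCost   = 1 + 12             = 13
// so scalarization would win for this fixed VF, while scalable VFs always
// take the safe-divisor path because their ScalarizationCost stays invalid.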
3454
3455bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3456 Instruction *I, ElementCount VF) const {
3457 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3458 assert(getWideningDecision(I, VF) == CM_Unknown &&
3459 "Decision should not be set yet.");
3460 auto *Group = getInterleavedAccessGroup(I);
3461 assert(Group && "Must have a group.");
3462
3463 // If the instruction's allocated size doesn't equal its type size, it
3464 // requires padding and will be scalarized.
3465 auto &DL = I->getDataLayout();
3466 auto *ScalarTy = getLoadStoreType(I);
3467 if (hasIrregularType(ScalarTy, DL))
3468 return false;
3469
3470 // If the group involves a non-integral pointer, we may not be able to
3471 // losslessly cast all values to a common type.
3472 unsigned InterleaveFactor = Group->getFactor();
3473 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3474 for (unsigned i = 0; i < InterleaveFactor; i++) {
3475 Instruction *Member = Group->getMember(i);
3476 if (!Member)
3477 continue;
3478 auto *MemberTy = getLoadStoreType(Member);
3479 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3480 // Don't coerce non-integral pointers to integers or vice versa.
3481 if (MemberNI != ScalarNI) {
3482 // TODO: Consider adding special nullptr value case here
3483 return false;
3484 } else if (MemberNI && ScalarNI &&
3485 ScalarTy->getPointerAddressSpace() !=
3486 MemberTy->getPointerAddressSpace()) {
3487 return false;
3488 }
3489 }
3490
3491 // Check if masking is required.
3492 // A Group may need masking for one of two reasons: it resides in a block that
3493 // needs predication, or it was decided to use masking to deal with gaps
3494 // (either a gap at the end of a load-access that may result in a speculative
3495 // load, or any gaps in a store-access).
3496 bool PredicatedAccessRequiresMasking =
3497 blockNeedsPredicationForAnyReason(I->getParent()) &&
3498 Legal->isMaskRequired(I);
3499 bool LoadAccessWithGapsRequiresEpilogMasking =
3500 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3501 !isScalarEpilogueAllowed();
3502 bool StoreAccessWithGapsRequiresMasking =
3503 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3504 if (!PredicatedAccessRequiresMasking &&
3505 !LoadAccessWithGapsRequiresEpilogMasking &&
3506 !StoreAccessWithGapsRequiresMasking)
3507 return true;
3508
3509 // If masked interleaving is required, we expect that the user/target had
3510 // enabled it, because otherwise it either wouldn't have been created or
3511 // it should have been invalidated by the CostModel.
3512 assert(useMaskedInterleavedAccesses(TTI) &&
3513 "Masked interleave-groups for predicated accesses are not enabled.");
3514
3515 if (Group->isReverse())
3516 return false;
3517
3518 auto *Ty = getLoadStoreType(I);
3519 const Align Alignment = getLoadStoreAlignment(I);
3520 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3521 : TTI.isLegalMaskedStore(Ty, Alignment);
3522}
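// Example (editorial): a store group with a gap, e.g.
//   for (i = 0; i < n; ++i) {
//     a[3*i]     = x;   // member 0
//     a[3*i + 1] = y;   // member 1
//                       // member 2 never written: NumMembers(2) < Factor(3)
//   }
// sets StoreAccessWithGapsRequiresMasking above, so widening the group also
// requires that masked stores of the member type be legal on the target.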
3523
3524bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3525 Instruction *I, ElementCount VF) {
3526 // Get and ensure we have a valid memory instruction.
3527 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3528
3530 auto *ScalarTy = getLoadStoreType(I);
3531
3532 // In order to be widened, the pointer should be consecutive, first of all.
3533 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3534 return false;
3535
3536 // If the instruction is a store located in a predicated block, it will be
3537 // scalarized.
3538 if (isScalarWithPredication(I, VF))
3539 return false;
3540
3541 // If the instruction's allocated size doesn't equal its type size, it
3542 // requires padding and will be scalarized.
3543 auto &DL = I->getDataLayout();
3544 if (hasIrregularType(ScalarTy, DL))
3545 return false;
3546
3547 return true;
3548}
3549
3550void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3551 // We should not collect Uniforms more than once per VF. Right now,
3552 // this function is called from collectUniformsAndScalars(), which
3553 // already does this check. Collecting Uniforms for VF=1 does not make any
3554 // sense.
3555
3556 assert(VF.isVector() && !Uniforms.contains(VF) &&
3557 "This function should not be visited twice for the same VF");
3558
3559 // Visit the list of Uniforms. If we do not find any uniform value, we will
3560 // not analyze it again. Uniforms.count(VF) will return 1.
3561 Uniforms[VF].clear();
3562
3563 // We now know that the loop is vectorizable!
3564 // Collect instructions inside the loop that will remain uniform after
3565 // vectorization.
3566
3567 // Global values, params and instructions outside of current loop are out of
3568 // scope.
3569 auto isOutOfScope = [&](Value *V) -> bool {
3570 Instruction *I = dyn_cast<Instruction>(V);
3571 return (!I || !TheLoop->contains(I));
3572 };
3573
3574 // Worklist containing uniform instructions demanding lane 0.
3575 SetVector<Instruction *> Worklist;
3576
3577 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3578 // that require predication must not be considered uniform after
3579 // vectorization, because that would create an erroneous replicating region
3580 // where only a single instance out of VF should be formed.
3581 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
3582 if (isOutOfScope(I)) {
3583 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3584 << *I << "\n");
3585 return;
3586 }
3587 if (isPredicatedInst(I)) {
3588 LLVM_DEBUG(
3589 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3590 << "\n");
3591 return;
3592 }
3593 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3594 Worklist.insert(I);
3595 };
3596
3597 // Start with the conditional branches exiting the loop. If the branch
3598 // condition is an instruction contained in the loop that is only used by the
3599 // branch, it is uniform.
3600 SmallVector<BasicBlock *, 4> Exiting;
3601 TheLoop->getExitingBlocks(Exiting);
3602 for (BasicBlock *E : Exiting) {
3603 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3604 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3605 addToWorklistIfAllowed(Cmp);
3606 }
3607
3608 auto PrevVF = VF.divideCoefficientBy(2);
3609 // Return true if all lanes perform the same memory operation, and we can
3610 // thus choose to execute only one.
3611 auto isUniformMemOpUse = [&](Instruction *I) {
3612 // If the value was already known to not be uniform for the previous
3613 // (smaller VF), it cannot be uniform for the larger VF.
3614 if (PrevVF.isVector()) {
3615 auto Iter = Uniforms.find(PrevVF);
3616 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3617 return false;
3618 }
3619 if (!Legal->isUniformMemOp(*I, VF))
3620 return false;
3621 if (isa<LoadInst>(I))
3622 // Loading the same address always produces the same result - at least
3623 // assuming aliasing and ordering which have already been checked.
3624 return true;
3625 // Storing the same value on every iteration.
3626 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3627 };
3628
3629 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
3630 InstWidening WideningDecision = getWideningDecision(I, VF);
3631 assert(WideningDecision != CM_Unknown &&
3632 "Widening decision should be ready at this moment");
3633
3634 if (isUniformMemOpUse(I))
3635 return true;
3636
3637 return (WideningDecision == CM_Widen ||
3638 WideningDecision == CM_Widen_Reverse ||
3639 WideningDecision == CM_Interleave);
3640 };
3641
3642 // Returns true if Ptr is the pointer operand of a memory access instruction
3643 // I, I is known to not require scalarization, and the pointer is not also
3644 // stored.
3645 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3646 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3647 return false;
3648 return getLoadStorePointerOperand(I) == Ptr &&
3649 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3650 };
3651
3652 // Holds a list of values which are known to have at least one uniform use.
3653 // Note that there may be other uses which aren't uniform. A "uniform use"
3654 // here is something which only demands lane 0 of the unrolled iterations;
3655 // it does not imply that all lanes produce the same value (e.g. this is not
3656 // the usual meaning of uniform)
3657 SetVector<Value *> HasUniformUse;
3658
3659 // Scan the loop for instructions which are either a) known to have only
3660 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3661 for (auto *BB : TheLoop->blocks())
3662 for (auto &I : *BB) {
3663 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3664 switch (II->getIntrinsicID()) {
3665 case Intrinsic::sideeffect:
3666 case Intrinsic::experimental_noalias_scope_decl:
3667 case Intrinsic::assume:
3668 case Intrinsic::lifetime_start:
3669 case Intrinsic::lifetime_end:
3670 if (TheLoop->hasLoopInvariantOperands(&I))
3671 addToWorklistIfAllowed(&I);
3672 break;
3673 default:
3674 break;
3675 }
3676 }
3677
3678 // ExtractValue instructions must be uniform, because the operands are
3679 // known to be loop-invariant.
3680 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3681 assert(isOutOfScope(EVI->getAggregateOperand()) &&
3682 "Expected aggregate value to be loop invariant");
3683 addToWorklistIfAllowed(EVI);
3684 continue;
3685 }
3686
3687 // If there's no pointer operand, there's nothing to do.
3688 auto *Ptr = getLoadStorePointerOperand(&I);
3689 if (!Ptr)
3690 continue;
3691
3692 if (isUniformMemOpUse(&I))
3693 addToWorklistIfAllowed(&I);
3694
3695 if (isVectorizedMemAccessUse(&I, Ptr))
3696 HasUniformUse.insert(Ptr);
3697 }
3698
3699 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3700 // demanding) users. Since loops are assumed to be in LCSSA form, this
3701 // disallows uses outside the loop as well.
3702 for (auto *V : HasUniformUse) {
3703 if (isOutOfScope(V))
3704 continue;
3705 auto *I = cast<Instruction>(V);
3706 auto UsersAreMemAccesses =
3707 llvm::all_of(I->users(), [&](User *U) -> bool {
3708 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
3709 });
3710 if (UsersAreMemAccesses)
3711 addToWorklistIfAllowed(I);
3712 }
3713
3714 // Expand Worklist in topological order: whenever a new instruction
3715 // is added, its users should already be inside Worklist. This ensures
3716 // that a uniform instruction will only be used by uniform instructions.
3717 unsigned idx = 0;
3718 while (idx != Worklist.size()) {
3719 Instruction *I = Worklist[idx++];
3720
3721 for (auto *OV : I->operand_values()) {
3722 // isOutOfScope operands cannot be uniform instructions.
3723 if (isOutOfScope(OV))
3724 continue;
3725 // First order recurrence Phi's should typically be considered
3726 // non-uniform.
3727 auto *OP = dyn_cast<PHINode>(OV);
3728 if (OP && Legal->isFixedOrderRecurrence(OP))
3729 continue;
3730 // If all the users of the operand are uniform, then add the
3731 // operand into the uniform worklist.
3732 auto *OI = cast<Instruction>(OV);
3733 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3734 auto *J = cast<Instruction>(U);
3735 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
3736 }))
3737 addToWorklistIfAllowed(OI);
3738 }
3739 }
3740
3741 // For an instruction to be added into Worklist above, all its users inside
3742 // the loop should also be in Worklist. However, this condition cannot be
3743 // true for phi nodes that form a cyclic dependence. We must process phi
3744 // nodes separately. An induction variable will remain uniform if all users
3745 // of the induction variable and induction variable update remain uniform.
3746 // The code below handles both pointer and non-pointer induction variables.
3747 BasicBlock *Latch = TheLoop->getLoopLatch();
3748 for (const auto &Induction : Legal->getInductionVars()) {
3749 auto *Ind = Induction.first;
3750 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3751
3752 // Determine if all users of the induction variable are uniform after
3753 // vectorization.
3754 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3755 auto *I = cast<Instruction>(U);
3756 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3757 isVectorizedMemAccessUse(I, Ind);
3758 });
3759 if (!UniformInd)
3760 continue;
3761
3762 // Determine if all users of the induction variable update instruction are
3763 // uniform after vectorization.
3764 auto UniformIndUpdate =
3765 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3766 auto *I = cast<Instruction>(U);
3767 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3768 isVectorizedMemAccessUse(I, IndUpdate);
3769 });
3770 if (!UniformIndUpdate)
3771 continue;
3772
3773 // The induction variable and its update instruction will remain uniform.
3774 addToWorklistIfAllowed(Ind);
3775 addToWorklistIfAllowed(IndUpdate);
3776 }
3777
3778 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3779}
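// Example (editorial sketch): in
//   for (i = 0; i < n; ++i)
//     sum += a[i] == *q;   // q loop-invariant
// the load of *q is a uniform memory op (same address on every iteration),
// so only lane 0 of it is demanded, whereas a[i] and the compare must be
// widened. The latch compare i < n is also collected as uniform because it
// only feeds the exiting branch.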
3780
3781bool LoopVectorizationCostModel::runtimeChecksRequired() {
3782 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3783
3784 if (Legal->getRuntimePointerChecking()->Need) {
3785 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3786 "runtime pointer checks needed. Enable vectorization of this "
3787 "loop with '#pragma clang loop vectorize(enable)' when "
3788 "compiling with -Os/-Oz",
3789 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3790 return true;
3791 }
3792
3793 if (!PSE.getPredicate().isAlwaysTrue()) {
3794 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3795 "runtime SCEV checks needed. Enable vectorization of this "
3796 "loop with '#pragma clang loop vectorize(enable)' when "
3797 "compiling with -Os/-Oz",
3798 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3799 return true;
3800 }
3801
3802 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3803 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3804 reportVectorizationFailure("Runtime stride check for small trip count",
3805 "runtime stride == 1 checks needed. Enable vectorization of "
3806 "this loop without such check by compiling with -Os/-Oz",
3807 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3808 return true;
3809 }
3810
3811 return false;
3812}
3813
3814bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3815 if (IsScalableVectorizationAllowed)
3816 return *IsScalableVectorizationAllowed;
3817
3818 IsScalableVectorizationAllowed = false;
3820 return false;
3821
3822 if (Hints->isScalableVectorizationDisabled()) {
3823 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3824 "ScalableVectorizationDisabled", ORE, TheLoop);
3825 return false;
3826 }
3827
3828 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3829
3830 auto MaxScalableVF = ElementCount::getScalable(
3831 std::numeric_limits<ElementCount::ScalarTy>::max());
3832
3833 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3834 // FIXME: While for scalable vectors this is currently sufficient, this should
3835 // be replaced by a more detailed mechanism that filters out specific VFs,
3836 // instead of invalidating vectorization for a whole set of VFs based on the
3837 // MaxVF.
3838
3839 // Disable scalable vectorization if the loop contains unsupported reductions.
3840 if (!canVectorizeReductions(MaxScalableVF)) {
3841 reportVectorizationInfo(
3842 "Scalable vectorization not supported for the reduction "
3843 "operations found in this loop.",
3844 "ScalableVFUnfeasible", ORE, TheLoop);
3845 return false;
3846 }
3847
3848 // Disable scalable vectorization if the loop contains any instructions
3849 // with element types not supported for scalable vectors.
3850 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3851 return !Ty->isVoidTy() &&
3853 })) {
3854 reportVectorizationInfo("Scalable vectorization is not supported "
3855 "for all element types found in this loop.",
3856 "ScalableVFUnfeasible", ORE, TheLoop);
3857 return false;
3858 }
3859
3861 reportVectorizationInfo("The target does not provide maximum vscale value "
3862 "for safe distance analysis.",
3863 "ScalableVFUnfeasible", ORE, TheLoop);
3864 return false;
3865 }
3866
3867 IsScalableVectorizationAllowed = true;
3868 return true;
3869}
3870
3871ElementCount
3872LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3873 if (!isScalableVectorizationAllowed())
3874 return ElementCount::getScalable(0);
3875
3876 auto MaxScalableVF = ElementCount::getScalable(
3877 std::numeric_limits<ElementCount::ScalarTy>::max());
3878 if (Legal->isSafeForAnyVectorWidth())
3879 return MaxScalableVF;
3880
3881 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3882 // Limit MaxScalableVF by the maximum safe dependence distance.
3883 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3884
3885 if (!MaxScalableVF)
3886 reportVectorizationInfo(
3887 "Max legal vector width too small, scalable vectorization "
3888 "unfeasible.",
3889 "ScalableVFUnfeasible", ORE, TheLoop);
3890
3891 return MaxScalableVF;
3892}
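// Worked example (editorial, assumed numbers): with MaxSafeElements = 32 and
// a target whose maximum vscale is 16, the clamp above yields
// ElementCount::getScalable(32 / 16), i.e. vscale x 2: at most two elements
// per vscale unit still respect the maximum safe dependence distance.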
3893
3894FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3895 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3897 unsigned SmallestType, WidestType;
3898 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3899
3900 // Get the maximum safe dependence distance in bits computed by LAA.
3901 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3902 // the memory access that is most restrictive (involved in the smallest
3903 // dependence distance).
3904 unsigned MaxSafeElements =
3906
3907 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3908 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3909
3910 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3911 << ".\n");
3912 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3913 << ".\n");
3914
3915 // First analyze the UserVF, fall back if the UserVF should be ignored.
3916 if (UserVF) {
3917 auto MaxSafeUserVF =
3918 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3919
3920 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3921 // If `VF=vscale x N` is safe, then so is `VF=N`
3922 if (UserVF.isScalable())
3923 return FixedScalableVFPair(
3924 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3925 else
3926 return UserVF;
3927 }
3928
3929 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3930
3931 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3932 // is better to ignore the hint and let the compiler choose a suitable VF.
3933 if (!UserVF.isScalable()) {
3934 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3935 << " is unsafe, clamping to max safe VF="
3936 << MaxSafeFixedVF << ".\n");
3937 ORE->emit([&]() {
3938 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3939 TheLoop->getStartLoc(),
3940 TheLoop->getHeader())
3941 << "User-specified vectorization factor "
3942 << ore::NV("UserVectorizationFactor", UserVF)
3943 << " is unsafe, clamping to maximum safe vectorization factor "
3944 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3945 });
3946 return MaxSafeFixedVF;
3947 }
3948
3950 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3951 << " is ignored because scalable vectors are not "
3952 "available.\n");
3953 ORE->emit([&]() {
3954 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3955 TheLoop->getStartLoc(),
3956 TheLoop->getHeader())
3957 << "User-specified vectorization factor "
3958 << ore::NV("UserVectorizationFactor", UserVF)
3959 << " is ignored because the target does not support scalable "
3960 "vectors. The compiler will pick a more suitable value.";
3961 });
3962 } else {
3963 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3964 << " is unsafe. Ignoring scalable UserVF.\n");
3965 ORE->emit([&]() {
3966 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3967 TheLoop->getStartLoc(),
3968 TheLoop->getHeader())
3969 << "User-specified vectorization factor "
3970 << ore::NV("UserVectorizationFactor", UserVF)
3971 << " is unsafe. Ignoring the hint to let the compiler pick a "
3972 "more suitable value.";
3973 });
3974 }
3975 }
3976
3977 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3978 << " / " << WidestType << " bits.\n");
3979
3982 if (auto MaxVF =
3983 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3984 MaxSafeFixedVF, FoldTailByMasking))
3985 Result.FixedVF = MaxVF;
3986
3987 if (auto MaxVF =
3988 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3989 MaxSafeScalableVF, FoldTailByMasking))
3990 if (MaxVF.isScalable()) {
3991 Result.ScalableVF = MaxVF;
3992 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3993 << "\n");
3994 }
3995
3996 return Result;
3997}
3998
3999FixedScalableVFPair
4000LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4001 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4002 // TODO: It may be useful to do since it's still likely to be dynamically
4003 // uniform if the target can skip.
4004 reportVectorizationFailure(
4005 "Not inserting runtime ptr check for divergent target",
4006 "runtime pointer checks needed. Not enabled for divergent target",
4007 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4008 return FixedScalableVFPair::getNone();
4009 }
4010
4011 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4012 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4013 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4014 if (TC == 1) {
4015 reportVectorizationFailure("Single iteration (non) loop",
4016 "loop trip count is one, irrelevant for vectorization",
4017 "SingleIterationLoop", ORE, TheLoop);
4018 return FixedScalableVFPair::getNone();
4019 }
4020
4021 switch (ScalarEpilogueStatus) {
4022 case CM_ScalarEpilogueAllowed:
4023 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4024 case CM_ScalarEpilogueNotAllowedUsePredicate:
4025 [[fallthrough]];
4026 case CM_ScalarEpilogueNotNeededUsePredicate:
4027 LLVM_DEBUG(
4028 dbgs() << "LV: vector predicate hint/switch found.\n"
4029 << "LV: Not allowing scalar epilogue, creating predicated "
4030 << "vector loop.\n");
4031 break;
4032 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4033 // fallthrough as a special case of OptForSize
4034 case CM_ScalarEpilogueNotAllowedOptSize:
4035 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4036 LLVM_DEBUG(
4037 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4038 else
4039 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4040 << "count.\n");
4041
4042 // Bail if runtime checks are required, which are not good when optimising
4043 // for size.
4044 if (runtimeChecksRequired())
4045 return FixedScalableVFPair::getNone();
4046
4047 break;
4048 }
4049
4050 // The only loops we can vectorize without a scalar epilogue, are loops with
4051 // a bottom-test and a single exiting block. We'd have to handle the fact
4052 // that not every instruction executes on the last iteration. This will
4053 // require a lane mask which varies through the vector loop body. (TODO)
4054 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4055 // If there was a tail-folding hint/switch, but we can't fold the tail by
4056 // masking, fallback to a vectorization with a scalar epilogue.
4057 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4058 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4059 "scalar epilogue instead.\n");
4060 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4061 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4062 }
4063 return FixedScalableVFPair::getNone();
4064 }
4065
4066 // Now try the tail folding
4067
4068 // Invalidate interleave groups that require an epilogue if we can't mask
4069 // the interleave-group.
4070 if (!useMaskedInterleavedAccesses(TTI)) {
4071 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4072 "No decisions should have been taken at this point");
4073 // Note: There is no need to invalidate any cost modeling decisions here, as
4074 // none were taken so far.
4075 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4076 }
4077
4078 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4079
4080 // Avoid tail folding if the trip count is known to be a multiple of any VF
4081 // we choose.
4082 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4083 MaxFactors.FixedVF.getFixedValue();
4084 if (MaxFactors.ScalableVF) {
4085 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4086 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4087 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4088 *MaxPowerOf2RuntimeVF,
4089 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4090 } else
4091 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4092 }
4093
4094 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4095 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4096 "MaxFixedVF must be a power of 2");
4097 unsigned MaxVFtimesIC =
4098 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4099 ScalarEvolution *SE = PSE.getSE();
4100 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4101 const SCEV *ExitCount = SE->getAddExpr(
4102 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4103 const SCEV *Rem = SE->getURemExpr(
4104 SE->applyLoopGuards(ExitCount, TheLoop),
4105 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4106 if (Rem->isZero()) {
4107 // Accept MaxFixedVF if we do not have a tail.
4108 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4109 return MaxFactors;
4110 }
4111 }
4112
4113 // If we don't know the precise trip count, or if the trip count that we
4114 // found modulo the vectorization factor is not zero, try to fold the tail
4115 // by masking.
4116 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4117 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4118 if (foldTailByMasking()) {
4119 if (foldTailWithEVL()) {
4120 LLVM_DEBUG(
4121 dbgs()
4122 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4123 "try to generate VP Intrinsics with scalable vector "
4124 "factors only.\n");
4125 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4126 // for now.
4127 // TODO: extend it for fixed vectors, if required.
4128 assert(MaxFactors.ScalableVF.isScalable() &&
4129 "Expected scalable vector factor.");
4130
4131 MaxFactors.FixedVF = ElementCount::getFixed(1);
4132 }
4133 return MaxFactors;
4134 }
4135
4136 // If there was a tail-folding hint/switch, but we can't fold the tail by
4137 // masking, fallback to a vectorization with a scalar epilogue.
4138 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4139 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4140 "scalar epilogue instead.\n");
4141 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4142 return MaxFactors;
4143 }
4144
4145 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4146 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4147 return FixedScalableVFPair::getNone();
4148 }
4149
4150 if (TC == 0) {
4151 reportVectorizationFailure(
4152 "Unable to calculate the loop count due to complex control flow",
4153 "unable to calculate the loop count due to complex control flow",
4154 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4155 return FixedScalableVFPair::getNone();
4156
4157
4158 reportVectorizationFailure(
4159 "Cannot optimize for size and vectorize at the same time.",
4160 "cannot optimize for size and vectorize at the same time. "
4161 "Enable vectorization of this loop with '#pragma clang loop "
4162 "vectorize(enable)' when compiling with -Os/-Oz",
4163 "NoTailLoopWithOptForSize", ORE, TheLoop);
4164 return FixedScalableVFPair::getNone();
4165}
4166
4167ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4168 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4169 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4170 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4171 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4172 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4173 : TargetTransformInfo::RGK_FixedWidthVector);
4174
4175 // Convenience function to return the minimum of two ElementCounts.
4176 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4177 assert((LHS.isScalable() == RHS.isScalable()) &&
4178 "Scalable flags must match");
4179 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4180 };
4181
4182 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4183 // Note that both WidestRegister and WidestType may not be powers of 2.
4184 auto MaxVectorElementCount = ElementCount::get(
4185 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4186 ComputeScalableMaxVF);
4187 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4188 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4189 << (MaxVectorElementCount * WidestType) << " bits.\n");
4190
4191 if (!MaxVectorElementCount) {
4192 LLVM_DEBUG(dbgs() << "LV: The target has no "
4193 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4194 << " vector registers.\n");
4195 return ElementCount::getFixed(1);
4196 }
4197
4198 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4199 if (MaxVectorElementCount.isScalable() &&
4200 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4201 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4202 auto Min = Attr.getVScaleRangeMin();
4203 WidestRegisterMinEC *= Min;
4204 }
4205
4206 // When a scalar epilogue is required, at least one iteration of the scalar
4207 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4208 // max VF that results in a dead vector loop.
4209 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4210 MaxTripCount -= 1;
4211
4212 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4213 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4214 // If upper bound loop trip count (TC) is known at compile time there is no
4215 // point in choosing VF greater than TC (as done in the loop below). Select
4216 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4217 // scalable, we only fall back on a fixed VF when the TC is less than or
4218 // equal to the known number of lanes.
4219 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4220 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4221 "exceeding the constant trip count: "
4222 << ClampedUpperTripCount << "\n");
4223 return ElementCount::get(
4224 ClampedUpperTripCount,
4225 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4226 }
4227
4228 TargetTransformInfo::RegisterKind RegKind =
4229 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4230 : TargetTransformInfo::RGK_FixedWidthVector;
4231 ElementCount MaxVF = MaxVectorElementCount;
4232 if (MaximizeBandwidth ||
4233 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4236 auto MaxVectorElementCountMaxBW = ElementCount::get(
4237 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4238 ComputeScalableMaxVF);
4239 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4240
4241 // Collect all viable vectorization factors larger than the default MaxVF
4242 // (i.e. MaxVectorElementCount).
4244 for (ElementCount VS = MaxVectorElementCount * 2;
4245 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4246 VFs.push_back(VS);
4247
4248 // For each VF calculate its register usage.
4249 auto RUs = calculateRegisterUsage(VFs);
4250
4251 // Select the largest VF which doesn't require more registers than existing
4252 // ones.
4253 for (int I = RUs.size() - 1; I >= 0; --I) {
4254 const auto &MLU = RUs[I].MaxLocalUsers;
4255 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4256 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4257 })) {
4258 MaxVF = VFs[I];
4259 break;
4260 }
4261 }
4262 if (ElementCount MinVF =
4263 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4264 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4265 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4266 << ") with target's minimum: " << MinVF << '\n');
4267 MaxVF = MinVF;
4268 }
4269 }
4270
4271 // Invalidate any widening decisions we might have made, in case the loop
4272 // requires predication (decided later), but we have already made some
4273 // load/store widening decisions.
4274 invalidateCostModelingDecisions();
4275 }
4276 return MaxVF;
4277}
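// Worked example (editorial, assuming 128-bit fixed-width registers): with
// WidestType = 32 the starting point is bit_floor(128 / 32) = 4 lanes. If in
// addition the trip count is known to be bounded by 3 and the tail is not
// folded, the clamp above picks bit_floor(3) = 2 so the vector loop is not
// dead. MaximizeBandwidth may later grow the VF again, provided the register
// usage of the wider factors still fits the target.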
4278
4279/// Convenience function that returns the value of vscale_range iff
4280/// vscale_range.min == vscale_range.max or otherwise returns the value
4281/// returned by the corresponding TTI method.
4282static std::optional<unsigned>
4283getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4284 const Function *Fn = L->getHeader()->getParent();
4285 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4286 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4287 auto Min = Attr.getVScaleRangeMin();
4288 auto Max = Attr.getVScaleRangeMax();
4289 if (Max && Min == Max)
4290 return Max;
4291 }
4292
4293 return TTI.getVScaleForTuning();
4294}
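// Example (editorial): a function attributed with vscale_range(2,2) makes
// the helper above return 2 directly; with vscale_range(1,16) the bounds
// differ, so the target's TTI.getVScaleForTuning() answer is used instead.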
4295
4296bool LoopVectorizationPlanner::isMoreProfitable(
4297 const VectorizationFactor &A, const VectorizationFactor &B) const {
4298 InstructionCost CostA = A.Cost;
4299 InstructionCost CostB = B.Cost;
4300
4301 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4302
4303 // Improve estimate for the vector width if it is scalable.
4304 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4305 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4306 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4307 if (A.Width.isScalable())
4308 EstimatedWidthA *= *VScale;
4309 if (B.Width.isScalable())
4310 EstimatedWidthB *= *VScale;
4311 }
4312
4313 // Assume vscale may be larger than 1 (or the value being tuned for),
4314 // so that scalable vectorization is slightly favorable over fixed-width
4315 // vectorization.
4316 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4317 A.Width.isScalable() && !B.Width.isScalable();
4318
4319 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4320 const InstructionCost &RHS) {
4321 return PreferScalable ? LHS <= RHS : LHS < RHS;
4322 };
4323
4324 // To avoid the need for FP division:
4325 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4326 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4327 if (!MaxTripCount)
4328 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4329
4330 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4331 InstructionCost VectorCost,
4332 InstructionCost ScalarCost) {
4333 // If the trip count is a known (possibly small) constant, the trip count
4334 // will be rounded up to an integer number of iterations under
4335 // FoldTailByMasking. The total cost in that case will be
4336 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4337 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4338 // some extra overheads, but for the purpose of comparing the costs of
4339 // different VFs we can use this to compare the total loop-body cost
4340 // expected after vectorization.
4341 if (CM.foldTailByMasking())
4342 return VectorCost * divideCeil(MaxTripCount, VF);
4343 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4344 };
4345
4346 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4347 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4348 return CmpFn(RTCostA, RTCostB);
4349}
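// Worked example (editorial, assumed costs): comparing A = {VF=4, Cost=20}
// with B = {VF=8, Cost=36} and no known trip count uses the cross-multiplied
// per-lane form: 20 * 8 = 160 vs 36 * 4 = 144, so B is the better candidate.
// With MaxTripCount = 10 and no tail folding the comparison becomes
//   A: 20 * (10 / 4) + ScalarCost * (10 % 4) = 40 + 2 * ScalarCost
//   B: 36 * (10 / 8) + ScalarCost * (10 % 8) = 36 + 2 * ScalarCost
// which still favours B under these made-up numbers.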
4350
4353 Loop *TheLoop) {
4354 if (InvalidCosts.empty())
4355 return;
4356
4357 // Emit a report of VFs with invalid costs in the loop.
4358
4359 // Group the remarks per instruction, keeping the instruction order from
4360 // InvalidCosts.
4361 std::map<Instruction *, unsigned> Numbering;
4362 unsigned I = 0;
4363 for (auto &Pair : InvalidCosts)
4364 if (!Numbering.count(Pair.first))
4365 Numbering[Pair.first] = I++;
4366
4367 // Sort the list, first on instruction(number) then on VF.
4368 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4369 if (Numbering[A.first] != Numbering[B.first])
4370 return Numbering[A.first] < Numbering[B.first];
4371 const auto &LHS = A.second;
4372 const auto &RHS = B.second;
4373 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4374 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4375 });
4376
4377 // For a list of ordered instruction-vf pairs:
4378 // [(load, vf1), (load, vf2), (store, vf1)]
4379 // Group the instructions together to emit separate remarks for:
4380 // load (vf1, vf2)
4381 // store (vf1)
4382 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4383 auto Subset = ArrayRef<InstructionVFPair>();
4384 do {
4385 if (Subset.empty())
4386 Subset = Tail.take_front(1);
4387
4388 Instruction *I = Subset.front().first;
4389
4390 // If the next instruction is different, or if there are no other pairs,
4391 // emit a remark for the collated subset. e.g.
4392 // [(load, vf1), (load, vf2))]
4393 // to emit:
4394 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4395 if (Subset == Tail || Tail[Subset.size()].first != I) {
4396 std::string OutString;
4397 raw_string_ostream OS(OutString);
4398 assert(!Subset.empty() && "Unexpected empty range");
4399 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4400 for (const auto &Pair : Subset)
4401 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4402 OS << "):";
4403 if (auto *CI = dyn_cast<CallInst>(I))
4404 OS << " call to " << CI->getCalledFunction()->getName();
4405 else
4406 OS << " " << I->getOpcodeName();
4407 OS.flush();
4408 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4409 Tail = Tail.drop_front(Subset.size());
4410 Subset = {};
4411 } else
4412 // Grow the subset by one element
4413 Subset = Tail.take_front(Subset.size() + 1);
4414 } while (!Tail.empty());
4415}
4416
4417/// Check if any recipe of \p Plan will generate a vector value, which will be
4418/// assigned a vector register.
4419static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4420 const TargetTransformInfo &TTI) {
4421 assert(VF.isVector() && "Checking a scalar VF?");
4422 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4424 DenseSet<VPRecipeBase *> EphemeralRecipes;
4425 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4426 // Set of already visited types.
4427 DenseSet<Type *> Visited;
4428 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4430 for (VPRecipeBase &R : *VPBB) {
4431 if (EphemeralRecipes.contains(&R))
4432 continue;
4433 // Continue early if the recipe is considered to not produce a vector
4434 // result. Note that this includes VPInstruction where some opcodes may
4435 // produce a vector, to preserve existing behavior as VPInstructions model
4436 // aspects not directly mapped to existing IR instructions.
4437 switch (R.getVPDefID()) {
4438 case VPDef::VPDerivedIVSC:
4439 case VPDef::VPScalarIVStepsSC:
4440 case VPDef::VPScalarCastSC:
4441 case VPDef::VPReplicateSC:
4442 case VPDef::VPInstructionSC:
4443 case VPDef::VPCanonicalIVPHISC:
4444 case VPDef::VPVectorPointerSC:
4445 case VPDef::VPExpandSCEVSC:
4446 case VPDef::VPEVLBasedIVPHISC:
4447 case VPDef::VPPredInstPHISC:
4448 case VPDef::VPBranchOnMaskSC:
4449 continue;
4450 case VPDef::VPReductionSC:
4451 case VPDef::VPActiveLaneMaskPHISC:
4452 case VPDef::VPWidenCallSC:
4453 case VPDef::VPWidenCanonicalIVSC:
4454 case VPDef::VPWidenCastSC:
4455 case VPDef::VPWidenGEPSC:
4456 case VPDef::VPWidenSC:
4457 case VPDef::VPWidenSelectSC:
4458 case VPDef::VPBlendSC:
4459 case VPDef::VPFirstOrderRecurrencePHISC:
4460 case VPDef::VPWidenPHISC:
4461 case VPDef::VPWidenIntOrFpInductionSC:
4462 case VPDef::VPWidenPointerInductionSC:
4463 case VPDef::VPReductionPHISC:
4464 case VPDef::VPInterleaveSC:
4465 case VPDef::VPWidenLoadEVLSC:
4466 case VPDef::VPWidenLoadSC:
4467 case VPDef::VPWidenStoreEVLSC:
4468 case VPDef::VPWidenStoreSC:
4469 break;
4470 default:
4471 llvm_unreachable("unhandled recipe");
4472 }
4473
4474 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4475 Type *VectorTy = ToVectorTy(ScalarTy, VF);
4476 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4477 if (!NumLegalParts)
4478 return false;
4479 if (VF.isScalable()) {
4480 // <vscale x 1 x iN> is assumed to be profitable over iN because
4481 // scalable registers are a distinct register class from scalar
4482 // ones. If we ever find a target which wants to lower scalable
4483 // vectors back to scalars, we'll need to update this code to
4484 // explicitly ask TTI about the register class uses for each part.
4485 return NumLegalParts <= VF.getKnownMinValue();
4486 }
4487 // Two or more parts that share a register are vectorized.
4488 return NumLegalParts < VF.getKnownMinValue();
4489 };
4490
4491 // If the recipe has no defs and is not a store (e.g., a branch), continue - no value to check.
4492 if (R.getNumDefinedValues() == 0 &&
4493 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4494 &R))
4495 continue;
4496 // For multi-def recipes (currently only interleaved loads), it suffices to
4497 // check the first def only.
4498 // For stores check their stored value; for interleaved stores it suffices
4499 // to check the first stored value only. In all cases this is the second
4500 // operand.
4501 VPValue *ToCheck =
4502 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4503 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4504 if (!Visited.insert({ScalarTy}).second)
4505 continue;
4506 if (WillWiden(ScalarTy))
4507 return true;
4508 }
4509 }
4510
4511 return false;
4512}
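// Example (editorial, assuming 128-bit registers): a widened i32 operation
// at VF=4 maps to one legal part (1 < 4), so it counts as producing a vector
// value and the function returns true. An i64 operand at the same VF splits
// into 2 parts (2 < 4), which still counts. A type only fails the check when
// each legal part ends up holding a single element (NumLegalParts >= VF).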
4513
4514VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4516 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4517 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4518 assert(any_of(VPlans,
4519 [](std::unique_ptr<VPlan> &P) {
4520 return P->hasVF(ElementCount::getFixed(1));
4521 }) &&
4522 "Expected Scalar VF to be a candidate");
4523
4524 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4525 ExpectedCost);
4526 VectorizationFactor ChosenFactor = ScalarCost;
4527
4528 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4529 if (ForceVectorization &&
4530 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4531 // Ignore scalar width, because the user explicitly wants vectorization.
4532 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4533 // evaluation.
4534 ChosenFactor.Cost = InstructionCost::getMax();
4535 }
4536
4537 SmallVector<InstructionVFPair> InvalidCosts;
4538 for (auto &P : VPlans) {
4539 for (ElementCount VF :