LLVM 20.0.0git
LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
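//
// For example (an illustrative C sketch; the names below are not taken from
// this file), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, for a vectorization factor (VF) of 4, into a
// wide loop whose induction variable advances by 4 and whose body uses wide
// loads, a wide add and a wide store, with leftover iterations handled by a
// scalar epilogue (or by tail folding):
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     a[i:i+4] = b[i:i+4] + c[i:i+4];   // conceptually one <4 x i32> op each
//   for (; i < n; ++i)                  // scalar epilogue
//     a[i] = b[i] + c[i];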
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD.
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanVerifier.h"
65#include "llvm/ADT/APInt.h"
66#include "llvm/ADT/ArrayRef.h"
67#include "llvm/ADT/DenseMap.h"
69#include "llvm/ADT/Hashing.h"
70#include "llvm/ADT/MapVector.h"
71#include "llvm/ADT/STLExtras.h"
73#include "llvm/ADT/SmallSet.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/TypeSwitch.h"
83#include "llvm/Analysis/CFG.h"
99#include "llvm/IR/Attributes.h"
100#include "llvm/IR/BasicBlock.h"
101#include "llvm/IR/CFG.h"
102#include "llvm/IR/Constant.h"
103#include "llvm/IR/Constants.h"
104#include "llvm/IR/DataLayout.h"
105#include "llvm/IR/DebugInfo.h"
107#include "llvm/IR/DebugLoc.h"
108#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/Dominators.h"
111#include "llvm/IR/Function.h"
112#include "llvm/IR/IRBuilder.h"
113#include "llvm/IR/InstrTypes.h"
114#include "llvm/IR/Instruction.h"
115#include "llvm/IR/Instructions.h"
117#include "llvm/IR/Intrinsics.h"
118#include "llvm/IR/MDBuilder.h"
119#include "llvm/IR/Metadata.h"
120#include "llvm/IR/Module.h"
121#include "llvm/IR/Operator.h"
122#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/Type.h"
125#include "llvm/IR/Use.h"
126#include "llvm/IR/User.h"
127#include "llvm/IR/Value.h"
128#include "llvm/IR/ValueHandle.h"
130#include "llvm/IR/Verifier.h"
131#include "llvm/Support/Casting.h"
134#include "llvm/Support/Debug.h"
148#include <algorithm>
149#include <cassert>
150#include <cmath>
151#include <cstdint>
152#include <functional>
153#include <iterator>
154#include <limits>
155#include <map>
156#include <memory>
157#include <string>
158#include <tuple>
159#include <utility>
160
161using namespace llvm;
162
163#define LV_NAME "loop-vectorize"
164#define DEBUG_TYPE LV_NAME
165
166#ifndef NDEBUG
167const char VerboseDebug[] = DEBUG_TYPE "-verbose";
168#endif
169
170/// @{
171/// Metadata attribute names
172const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
174 "llvm.loop.vectorize.followup_vectorized";
176 "llvm.loop.vectorize.followup_epilogue";
177/// @}
178
179STATISTIC(LoopsVectorized, "Number of loops vectorized");
180STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
181STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
182
184 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
185 cl::desc("Enable vectorization of epilogue loops."));
186
188 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
189 cl::desc("When epilogue vectorization is enabled, and a value greater than "
190 "1 is specified, forces the given VF for all applicable epilogue "
191 "loops."));
192
194 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
195 cl::desc("Only loops with vectorization factor equal to or larger than "
196 "the specified value are considered for epilogue vectorization."));
197
198/// Loops with a known constant trip count below this number are vectorized only
199/// if no scalar iteration overheads are incurred.
201 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
202 cl::desc("Loops with a constant trip count that is smaller than this "
203 "value are vectorized only if no scalar iteration overheads "
204 "are incurred."));
205
207 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
208 cl::desc("The maximum allowed number of runtime memory checks"));
209
210// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
211// that predication is preferred, and this enum lists the available options. I.e., the
212// vectorizer will try to fold the tail-loop (epilogue) into the vector body
213// and predicate the instructions accordingly. If tail-folding fails, there are
214// different fallback strategies depending on these values:
215namespace PreferPredicateTy {
216 enum Option {
217 ScalarEpilogue = 0,
218 PredicateElseScalarEpilogue,
219 PredicateOrDontVectorize
220 };
221} // namespace PreferPredicateTy
222
224 "prefer-predicate-over-epilogue",
227 cl::desc("Tail-folding and predication preferences over creating a scalar "
228 "epilogue loop."),
230 "scalar-epilogue",
231 "Don't tail-predicate loops, create scalar epilogue"),
233 "predicate-else-scalar-epilogue",
234 "prefer tail-folding, create scalar epilogue if tail "
235 "folding fails."),
237 "predicate-dont-vectorize",
238 "prefers tail-folding, don't attempt vectorization if "
239 "tail-folding fails.")));
240
242 "force-tail-folding-style", cl::desc("Force the tail folding style"),
243 cl::init(TailFoldingStyle::None),
245 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
247 TailFoldingStyle::Data, "data",
248 "Create lane mask for data only, using active.lane.mask intrinsic"),
249 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
250 "data-without-lane-mask",
251 "Create lane mask with compare/stepvector"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
253 "Create lane mask using active.lane.mask intrinsic, and use "
254 "it for both data and control flow"),
255 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
256 "data-and-control-without-rt-check",
257 "Similar to data-and-control, but remove the runtime check"),
258 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
259 "Use predicated EVL instructions for tail folding. If EVL "
260 "is unsupported, fallback to data-without-lane-mask.")));
261
263 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
264 cl::desc("Maximize bandwidth when selecting vectorization factor which "
265 "will be determined by the smallest type in loop."));
266
268 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
269 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
270
271/// An interleave-group may need masking if it resides in a block that needs
272/// predication, or in order to mask away gaps.
274 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
275 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
276
278 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of scalar registers."));
280
282 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
283 cl::desc("A flag that overrides the target's number of vector registers."));
284
286 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "scalar loops."));
289
291 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
292 cl::desc("A flag that overrides the target's max interleave factor for "
293 "vectorized loops."));
294
296 "force-target-instruction-cost", cl::init(0), cl::Hidden,
297 cl::desc("A flag that overrides the target's expected cost for "
298 "an instruction to a single constant value. Mostly "
299 "useful for getting consistent testing."));
300
302 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
303 cl::desc(
304 "Pretend that scalable vectors are supported, even if the target does "
305 "not support them. This flag should only be used for testing."));
306
308 "small-loop-cost", cl::init(20), cl::Hidden,
309 cl::desc(
310 "The cost of a loop that is considered 'small' by the interleaver."));
311
313 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
314 cl::desc("Enable the use of the block frequency analysis to access PGO "
315 "heuristics minimizing code growth in cold regions and being more "
316 "aggressive in hot regions."));
317
318// Runtime interleave loops for load/store throughput.
320 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
321 cl::desc(
322 "Enable runtime interleaving until load/store ports are saturated"));
323
324/// The number of stores in a loop that are allowed to need predication.
326 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
327 cl::desc("Max number of stores to be predicated behind an if."));
328
330 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
331 cl::desc("Count the induction variable only once when interleaving"));
332
334 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
335 cl::desc("Enable if predication of stores during vectorization."));
336
338 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
339 cl::desc("The maximum interleave count to use when interleaving a scalar "
340 "reduction in a nested loop."));
341
342static cl::opt<bool>
343 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
345 cl::desc("Prefer in-loop vector reductions, "
346 "overriding the targets preference."));
347
349 "force-ordered-reductions", cl::init(false), cl::Hidden,
350 cl::desc("Enable the vectorisation of loops with in-order (strict) "
351 "FP reductions"));
352
354 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
355 cl::desc(
356 "Prefer predicating a reduction operation over an after loop select."));
357
358namespace llvm {
360 "enable-vplan-native-path", cl::Hidden,
361 cl::desc("Enable VPlan-native vectorization path with "
362 "support for outer loop vectorization."));
363}
364
365// This flag enables the stress testing of the VPlan H-CFG construction in the
366// VPlan-native vectorization path. It must be used in conjunction with
367// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
368// verification of the H-CFGs built.
370 "vplan-build-stress-test", cl::init(false), cl::Hidden,
371 cl::desc(
372 "Build VPlan for every supported loop nest in the function and bail "
373 "out right after the build (stress test the VPlan H-CFG construction "
374 "in the VPlan-native vectorization path)."));
375
377 "interleave-loops", cl::init(true), cl::Hidden,
378 cl::desc("Enable loop interleaving in Loop vectorization passes"));
380 "vectorize-loops", cl::init(true), cl::Hidden,
381 cl::desc("Run the Loop vectorization passes"));
382
384 "vplan-print-in-dot-format", cl::Hidden,
385 cl::desc("Use dot format instead of plain text when dumping VPlans"));
386
388 "force-widen-divrem-via-safe-divisor", cl::Hidden,
389 cl::desc(
390 "Override cost based safe divisor widening for div/rem instructions"));
391
393 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
395 cl::desc("Try wider VFs if they enable the use of vector variants"));
396
397// Likelihood of bypassing the vectorized loop because assumptions about SCEV
398// variables not overflowing do not hold. See `emitSCEVChecks`.
399static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
400// Likelihood of bypassing the vectorized loop because pointers overlap. See
401// `emitMemRuntimeChecks`.
402static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
403// Likelihood of bypassing the vectorized loop because there are zero trips left
404// after prolog. See `emitIterationCountCheck`.
405static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
406
407/// A helper function that returns true if the given type is irregular. The
408/// type is irregular if its allocated size doesn't equal the store size of an
409/// element of the corresponding vector type.
410static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
411 // Determine if an array of N elements of type Ty is "bitcast compatible"
412 // with a <N x Ty> vector.
413 // This is only true if there is no padding between the array elements.
414 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
415}
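//
// Illustrative example (assuming a typical x86-64 data layout; not a call
// made in this file): x86_fp80 has a type size of 80 bits but an alloc size
// of 128 bits, so it is irregular, while i32 (32 == 32) is regular:
//
//   hasIrregularType(Type::getX86_FP80Ty(Ctx), DL);   // true (padding)
//   hasIrregularType(Type::getInt32Ty(Ctx), DL);      // false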
416
417/// Returns "best known" trip count for the specified loop \p L as defined by
418/// the following procedure:
419/// 1) Returns exact trip count if it is known.
420/// 2) Returns expected trip count according to profile data if any.
421/// 3) Returns upper bound estimate if it is known.
422/// 4) Returns std::nullopt if all of the above failed.
423static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
424 Loop *L) {
425 // Check if exact trip count is known.
426 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
427 return ExpectedTC;
428
429 // Check if there is an expected trip count available from profile data.
431 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
432 return *EstimatedTC;
433
434 // Check if upper bound estimate is known.
435 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
436 return ExpectedTC;
437
438 return std::nullopt;
439}
440
441namespace {
442// Forward declare GeneratedRTChecks.
443class GeneratedRTChecks;
444
445using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446} // namespace
447
448namespace llvm {
449
451
452/// InnerLoopVectorizer vectorizes loops which contain only one basic
453/// block to a specified vectorization factor (VF).
454/// This class performs the widening of scalars into vectors, or multiple
455/// scalars. This class also implements the following features:
456/// * It inserts an epilogue loop for handling loops that don't have iteration
457/// counts that are known to be a multiple of the vectorization factor.
458/// * It handles the code generation for reduction variables.
459/// * Scalarization (implementation using scalars) of un-vectorizable
460/// instructions.
461/// InnerLoopVectorizer does not perform any vectorization-legality
462/// checks, and relies on the caller to check for the different legality
463/// aspects. The InnerLoopVectorizer relies on the
464/// LoopVectorizationLegality class to provide information about the induction
465/// and reduction variables that were found for a given vectorization factor.
467public:
470 const TargetLibraryInfo *TLI,
474 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
476 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
477 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
478 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
479 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
481 // Query this against the original loop and save it here because the profile
482 // of the original loop header may change as the transformation happens.
485
487 this->MinProfitableTripCount = VecWidth;
488 else
489 this->MinProfitableTripCount = MinProfitableTripCount;
490 }
491
492 virtual ~InnerLoopVectorizer() = default;
493
494 /// Create a new empty loop that will contain vectorized instructions later
495 /// on, while the old loop will be used as the scalar remainder. Control flow
496 /// is generated around the vectorized (and scalar epilogue) loops consisting
497 /// of various checks and bypasses. Return the pre-header block of the new
498 /// loop and the start value for the canonical induction, if it is != 0. The
499 /// latter is the case when vectorizing the epilogue loop. In the case of
500/// epilogue vectorization, this function is overridden to handle the more
501 /// complex control flow around the loops. \p ExpandedSCEVs is used to
502 /// look up SCEV expansions for expressions needed during skeleton creation.
503 virtual std::pair<BasicBlock *, Value *>
504 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
505
506 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
507 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
508
509 // Return true if any runtime check is added.
511
512 /// A helper function to scalarize a single Instruction in the innermost loop.
513 /// Generates a sequence of scalar instances for each lane between \p MinLane
514 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
515 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
516 /// Instr's operands.
517 void scalarizeInstruction(const Instruction *Instr,
518 VPReplicateRecipe *RepRecipe,
519 const VPIteration &Instance,
520 VPTransformState &State);
521
522 /// Fix the non-induction PHIs in \p Plan.
523 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
524
525 /// Create a new phi node for the induction variable \p OrigPhi to resume
526 /// iteration count in the scalar epilogue, from where the vectorized loop
527 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
528 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
529 /// and the resume values can come from an additional bypass block, the \p
530 /// AdditionalBypass pair provides information about the bypass block and the
531 /// end value on the edge from bypass to this loop.
533 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
534 ArrayRef<BasicBlock *> BypassBlocks,
535 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
536
537 /// Returns the original loop trip count.
538 Value *getTripCount() const { return TripCount; }
539
540 /// Used to set the trip count after ILV's construction and after the
541 /// preheader block has been executed. Note that this always holds the trip
542 /// count of the original loop for both main loop and epilogue vectorization.
543 void setTripCount(Value *TC) { TripCount = TC; }
544
545protected:
547
548 /// A small list of PHINodes.
550
551 /// A type for scalarized values in the new loop. Each value from the
552 /// original loop, when scalarized, is represented by UF x VF scalar values
553 /// in the new unrolled loop, where UF is the unroll factor and VF is the
554 /// vectorization factor.
556
557 /// Set up the values of the IVs correctly when exiting the vector loop.
558 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
559 Value *VectorTripCount, Value *EndValue,
560 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
561 VPlan &Plan, VPTransformState &State);
562
563 /// Iteratively sink the scalarized operands of a predicated instruction into
564 /// the block that was created for it.
565 void sinkScalarOperands(Instruction *PredInst);
566
567 /// Returns (and creates if needed) the trip count of the widened loop.
569
570 /// Emit a bypass check to see if the vector trip count is zero, including if
571 /// it overflows.
573
574 /// Emit a bypass check to see if all of the SCEV assumptions we've
575 /// had to make are correct. Returns the block containing the checks or
576 /// nullptr if no checks have been added.
578
579 /// Emit bypass checks to check any memory assumptions we may have made.
580 /// Returns the block containing the checks or nullptr if no checks have been
581 /// added.
583
584 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
585 /// vector loop preheader, middle block and scalar preheader.
587
588 /// Create new phi nodes for the induction variables to resume iteration count
589 /// in the scalar epilogue, from where the vectorized loop left off.
590 /// In cases where the loop skeleton is more complicated (e.g., epilogue
591 /// vectorization) and the resume values can come from an additional bypass
592 /// block, the \p AdditionalBypass pair provides information about the bypass
593 /// block and the end value on the edge from bypass to this loop.
595 const SCEV2ValueTy &ExpandedSCEVs,
596 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
597
598 /// Complete the loop skeleton by adding debug MDs, creating appropriate
599 /// conditional branches in the middle block, preparing the builder and
600 /// running the verifier. Return the preheader of the completed vector loop.
602
603 /// Allow subclasses to override and print debug traces before/after vplan
604 /// execution, when trace information is requested.
605 virtual void printDebugTracesAtStart(){};
606 virtual void printDebugTracesAtEnd(){};
607
608 /// The original loop.
610
611 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
612 /// dynamic knowledge to simplify SCEV expressions and converts them to a
613 /// more usable form.
615
616 /// Loop Info.
618
619 /// Dominator Tree.
621
622 /// Target Library Info.
624
625 /// Target Transform Info.
627
628 /// Assumption Cache.
630
631 /// Interface to emit optimization remarks.
633
634 /// The vectorization SIMD factor to use. Each vector will have this many
635 /// vector elements.
637
639
640 /// The vectorization unroll factor to use. Each scalar is vectorized to this
641 /// many different vector instructions.
642 unsigned UF;
643
644 /// The builder that we use
646
647 // --- Vectorization state ---
648
649 /// The vector-loop preheader.
651
652 /// The scalar-loop preheader.
654
655 /// Middle Block between the vector and the scalar.
657
658 /// The unique ExitBlock of the scalar loop if one exists. Note that
659 /// there can be multiple exiting edges reaching this block.
661
662 /// The scalar loop body.
664
665 /// A list of all bypass blocks. The first block is the entry of the loop.
667
668 /// Store instructions that were predicated.
670
671 /// Trip count of the original loop.
672 Value *TripCount = nullptr;
673
674 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
676
677 /// The legality analysis.
679
680 /// The profitability analysis.
682
683 // Record whether runtime checks are added.
684 bool AddedSafetyChecks = false;
685
686 // Holds the end values for each induction variable. We save the end values
687 // so we can later fix-up the external users of the induction variables.
689
690 /// BFI and PSI are used to check for profile guided size optimizations.
693
694 // Whether this loop should be optimized for size based on profile guided size
695 // optimizations.
697
698 /// Structure to hold information about generated runtime checks, responsible
699 /// for cleaning the checks, if vectorization turns out unprofitable.
700 GeneratedRTChecks &RTChecks;
701
702 // Holds the resume values for reductions in the loops, used to set the
703 // correct start value of reduction PHIs when vectorizing the epilogue.
706};
707
709public:
712 const TargetLibraryInfo *TLI,
714 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
717 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
719 ElementCount::getFixed(1),
720 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
721 BFI, PSI, Check) {}
722};
723
724/// Encapsulate information regarding vectorization of a loop and its epilogue.
725/// This information is meant to be updated and used across two stages of
726/// epilogue vectorization.
729 unsigned MainLoopUF = 0;
731 unsigned EpilogueUF = 0;
736 Value *TripCount = nullptr;
738
740 ElementCount EVF, unsigned EUF)
741 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
742 assert(EUF == 1 &&
743 "A high UF for the epilogue loop is likely not beneficial.");
744 }
745};
746
747/// An extension of the inner loop vectorizer that creates a skeleton for a
748/// vectorized loop that has its epilogue (residual) also vectorized.
749/// The idea is to run the vplan on a given loop twice, first to set up the
750/// skeleton and vectorize the main loop, and second to complete the skeleton
751/// from the first step and vectorize the epilogue. This is achieved by
752/// deriving two concrete strategy classes from this base class and invoking
753/// them in succession from the loop vectorizer planner.
755public:
763 GeneratedRTChecks &Checks)
765 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
766 CM, BFI, PSI, Checks),
767 EPI(EPI) {}
768
769 // Override this function to handle the more complex control flow around the
770 // three loops.
771 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
772 const SCEV2ValueTy &ExpandedSCEVs) final {
773 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
774 }
775
776 /// The interface for creating a vectorized skeleton using one of two
777 /// different strategies, each corresponding to one execution of the vplan
778 /// as described above.
779 virtual std::pair<BasicBlock *, Value *>
780 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
781
782 /// Holds and updates state information required to vectorize the main loop
783 /// and its epilogue in two separate passes. This setup helps us avoid
784 /// regenerating and recomputing runtime safety checks. It also helps us to
785 /// shorten the iteration-count-check path length for the cases where the
786 /// iteration count of the loop is so small that the main vector loop is
787 /// completely skipped.
789};
790
791/// A specialized derived class of inner loop vectorizer that performs
792/// vectorization of *main* loops in the process of vectorizing loops and their
793/// epilogues.
795public:
803 GeneratedRTChecks &Check)
805 EPI, LVL, CM, BFI, PSI, Check) {}
806 /// Implements the interface for creating a vectorized skeleton using the
807 /// *main loop* strategy (i.e. the first pass of vplan execution).
808 std::pair<BasicBlock *, Value *>
809 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
810
811protected:
812 /// Emits an iteration count bypass check once for the main loop (when \p
813 /// ForEpilogue is false) and once for the epilogue loop (when \p
814 /// ForEpilogue is true).
815 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
816 void printDebugTracesAtStart() override;
817 void printDebugTracesAtEnd() override;
818};
819
820// A specialized derived class of inner loop vectorizer that performs
821// vectorization of *epilogue* loops in the process of vectorizing loops and
822// their epilogues.
824public:
832 GeneratedRTChecks &Checks)
834 EPI, LVL, CM, BFI, PSI, Checks) {
836 }
837 /// Implements the interface for creating a vectorized skeleton using the
838 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
839 std::pair<BasicBlock *, Value *>
840 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
841
842protected:
843 /// Emits an iteration count bypass check after the main vector loop has
844 /// finished to see if there are any iterations left to execute by either
845 /// the vector epilogue or the scalar epilogue.
847 BasicBlock *Bypass,
848 BasicBlock *Insert);
849 void printDebugTracesAtStart() override;
850 void printDebugTracesAtEnd() override;
851};
852} // end namespace llvm
853
854/// Look for a meaningful debug location on the instruction or its
855/// operands.
857 if (!I)
858 return DebugLoc();
859
861 if (I->getDebugLoc() != Empty)
862 return I->getDebugLoc();
863
864 for (Use &Op : I->operands()) {
865 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
866 if (OpInst->getDebugLoc() != Empty)
867 return OpInst->getDebugLoc();
868 }
869
870 return I->getDebugLoc();
871}
872
873/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
874/// is passed, the message relates to that particular instruction.
875#ifndef NDEBUG
876static void debugVectorizationMessage(const StringRef Prefix,
877 const StringRef DebugMsg,
878 Instruction *I) {
879 dbgs() << "LV: " << Prefix << DebugMsg;
880 if (I != nullptr)
881 dbgs() << " " << *I;
882 else
883 dbgs() << '.';
884 dbgs() << '\n';
885}
886#endif
887
888/// Create an analysis remark that explains why vectorization failed
889///
890/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
891/// RemarkName is the identifier for the remark. If \p I is passed it is an
892/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
893/// the location of the remark. If \p DL is passed, use it as debug location for
894/// the remark. \return the remark object that can be streamed to.
896createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
897 Instruction *I, DebugLoc DL = {}) {
898 Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
899 // If debug location is attached to the instruction, use it. Otherwise if DL
900 // was not provided, use the loop's.
901 if (I && I->getDebugLoc())
902 DL = I->getDebugLoc();
903 else if (!DL)
904 DL = TheLoop->getStartLoc();
905
906 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
907}
908
909namespace llvm {
910
911/// Return a value for Step multiplied by VF.
913 int64_t Step) {
914 assert(Ty->isIntegerTy() && "Expected an integer step");
915 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
916}
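//
// Illustrative examples (not calls made in this file), with I64Ty an i64 type:
//
//   createStepForVF(B, I64Ty, ElementCount::getFixed(4), 2);    // constant 8
//   createStepForVF(B, I64Ty, ElementCount::getScalable(4), 2); // 8 * vscale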
917
918/// Return the runtime value for VF.
920 return B.CreateElementCount(Ty, VF);
921}
922
924 Loop *OrigLoop) {
925 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
926 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
927
928 ScalarEvolution &SE = *PSE.getSE();
929 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
930}
931
933 const StringRef OREMsg, const StringRef ORETag,
934 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
935 Instruction *I) {
936 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
937 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
938 ORE->emit(
939 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
940 << "loop not vectorized: " << OREMsg);
941}
942
943/// Reports an informative message: print \p Msg for debugging purposes as well
944/// as an optimization remark. Uses either \p I as location of the remark, or
945/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
946/// remark.
947static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
949 Loop *TheLoop, Instruction *I = nullptr,
950 DebugLoc DL = {}) {
952 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
953 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
954 I, DL)
955 << Msg);
956}
957
958/// Report successful vectorization of the loop. In case an outer loop is
959/// vectorized, prepend "outer" to the vectorization remark.
961 VectorizationFactor VF, unsigned IC) {
963 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
964 nullptr));
965 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
966 ORE->emit([&]() {
967 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
968 TheLoop->getHeader())
969 << "vectorized " << LoopType << "loop (vectorization width: "
970 << ore::NV("VectorizationFactor", VF.Width)
971 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
972 });
973}
974
975} // end namespace llvm
976
977namespace llvm {
978
979// Loop vectorization cost-model hints how the scalar epilogue loop should be
980// lowered.
982
983 // The default: allowing scalar epilogues.
985
986 // Vectorization with OptForSize: don't allow epilogues.
988
989 // A special case of vectorization with OptForSize: loops with a very small
990 // trip count are considered for vectorization under OptForSize, thereby
991 // making sure the cost of their loop body is dominant, free of runtime
992 // guards and scalar iteration overheads.
994
995 // Loop hint predicate indicating an epilogue is undesired.
997
998 // Directive indicating we must either tail fold or not vectorize
1001
1002using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1003
1004/// LoopVectorizationCostModel - estimates the expected speedups due to
1005/// vectorization.
1006/// In many cases vectorization is not profitable. This can happen because of
1007/// a number of reasons. In this class we mainly attempt to predict the
1008/// expected speedup/slowdowns due to the supported instruction set. We use the
1009/// TargetTransformInfo to query the different backends for the cost of
1010/// different operations.
1012public:
1016 const TargetTransformInfo &TTI,
1022 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1023 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1024 Hints(Hints), InterleaveInfo(IAI) {}
1025
1026 /// \return An upper bound for the vectorization factors (both fixed and
1027 /// scalable). If the factors are 0, vectorization and interleaving should be
1028 /// avoided up front.
1029 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1030
1031 /// \return True if runtime checks are required for vectorization, and false
1032 /// otherwise.
1033 bool runtimeChecksRequired();
1034
1035 /// Setup cost-based decisions for user vectorization factor.
1036 /// \return true if the UserVF is a feasible VF to be chosen.
1040 return expectedCost(UserVF).isValid();
1041 }
1042
1043 /// \return The size (in bits) of the smallest and widest types in the code
1044 /// that needs to be vectorized. We ignore values that remain scalar such as
1045 /// 64 bit loop indices.
1046 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1047
1048 /// \return The desired interleave count.
1049 /// If interleave count has been specified by metadata it will be returned.
1050 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1051 /// are the selected vectorization factor and the cost of the selected VF.
1052 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1053
1054 /// Memory access instruction may be vectorized in more than one way.
1055 /// Form of instruction after vectorization depends on cost.
1056 /// This function takes cost-based decisions for Load/Store instructions
1057 /// and collects them in a map. This decisions map is used for building
1058 /// the lists of loop-uniform and loop-scalar instructions.
1059 /// The calculated cost is saved with widening decision in order to
1060 /// avoid redundant calculations.
1062
1063 /// A call may be vectorized in different ways depending on whether we have
1064 /// vectorized variants available and whether the target supports masking.
1065 /// This function analyzes all calls in the function at the supplied VF,
1066 /// makes a decision based on the costs of available options, and stores that
1067 /// decision in a map for use in planning and plan execution.
1069
1070 /// A struct that represents some properties of the register usage
1071 /// of a loop.
1073 /// Holds the number of loop invariant values that are used in the loop.
1074 /// The key is ClassID of target-provided register class.
1076 /// Holds the maximum number of concurrent live intervals in the loop.
1077 /// The key is ClassID of target-provided register class.
1079 };
1080
1081 /// \return Returns information about the register usages of the loop for the
1082 /// given vectorization factors.
1085
1086 /// Collect values we want to ignore in the cost model.
1087 void collectValuesToIgnore();
1088
1089 /// Collect all element types in the loop for which widening is needed.
1091
1092 /// Split reductions into those that happen in the loop, and those that happen
1093 /// outside. In-loop reductions are collected into InLoopReductions.
1095
1096 /// Returns true if we should use strict in-order reductions for the given
1097 /// RdxDesc. This is true if the -force-ordered-reductions flag is passed,
1098 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1099 /// of FP operations.
1100 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1101 return !Hints->allowReordering() && RdxDesc.isOrdered();
1102 }
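  //
  // Illustrative C sketch (not taken from this file): for
  //
  //   float S = 0.f;
  //   for (int I = 0; I < N; ++I)
  //     S += A[I];           // no reassociation allowed (no fast-math flags)
  //
  // the additions must be performed in source order, so the reduction is
  // vectorized as an ordered (strict) reduction rather than a reassociating
  // tree reduction.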
1103
1104 /// \returns The smallest bitwidth each instruction can be represented with.
1105 /// The vector equivalents of these instructions should be truncated to this
1106 /// type.
1108 return MinBWs;
1109 }
1110
1111 /// \returns True if it is more profitable to scalarize instruction \p I for
1112 /// vectorization factor \p VF.
1114 assert(VF.isVector() &&
1115 "Profitable to scalarize relevant only for VF > 1.");
1116 assert(
1117 TheLoop->isInnermost() &&
1118 "cost-model should not be used for outer loops (in VPlan-native path)");
1119
1120 auto Scalars = InstsToScalarize.find(VF);
1121 assert(Scalars != InstsToScalarize.end() &&
1122 "VF not yet analyzed for scalarization profitability");
1123 return Scalars->second.contains(I);
1124 }
1125
1126 /// Returns true if \p I is known to be uniform after vectorization.
1128 assert(
1129 TheLoop->isInnermost() &&
1130 "cost-model should not be used for outer loops (in VPlan-native path)");
1131 // Pseudo probe needs to be duplicated for each unrolled iteration and
1132 // vector lane so that profiled loop trip count can be accurately
1133 // accumulated instead of being undercounted.
1134 if (isa<PseudoProbeInst>(I))
1135 return false;
1136
1137 if (VF.isScalar())
1138 return true;
1139
1140 auto UniformsPerVF = Uniforms.find(VF);
1141 assert(UniformsPerVF != Uniforms.end() &&
1142 "VF not yet analyzed for uniformity");
1143 return UniformsPerVF->second.count(I);
1144 }
1145
1146 /// Returns true if \p I is known to be scalar after vectorization.
1148 assert(
1149 TheLoop->isInnermost() &&
1150 "cost-model should not be used for outer loops (in VPlan-native path)");
1151 if (VF.isScalar())
1152 return true;
1153
1154 auto ScalarsPerVF = Scalars.find(VF);
1155 assert(ScalarsPerVF != Scalars.end() &&
1156 "Scalar values are not calculated for VF");
1157 return ScalarsPerVF->second.count(I);
1158 }
1159
1160 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1161 /// for vectorization factor \p VF.
1163 return VF.isVector() && MinBWs.contains(I) &&
1164 !isProfitableToScalarize(I, VF) &&
1166 }
1167
1168 /// Decision that was taken during cost calculation for memory instruction.
1171 CM_Widen, // For consecutive accesses with stride +1.
1172 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1179
1180 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1181 /// instruction \p I and vector width \p VF.
1184 assert(VF.isVector() && "Expected VF >=2");
1185 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1186 }
1187
1188 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1189 /// interleaving group \p Grp and vector width \p VF.
1193 assert(VF.isVector() && "Expected VF >=2");
1194 /// Broadcast this decision to all instructions inside the group.
1195 /// But the cost will be assigned to one instruction only.
1196 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1197 if (auto *I = Grp->getMember(i)) {
1198 if (Grp->getInsertPos() == I)
1199 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1200 else
1201 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1202 }
1203 }
1204 }
1205
1206 /// Return the cost model decision for the given instruction \p I and vector
1207 /// width \p VF. Return CM_Unknown if this instruction did not pass
1208 /// through the cost modeling.
1210 assert(VF.isVector() && "Expected VF to be a vector VF");
1211 assert(
1212 TheLoop->isInnermost() &&
1213 "cost-model should not be used for outer loops (in VPlan-native path)");
1214
1215 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1216 auto Itr = WideningDecisions.find(InstOnVF);
1217 if (Itr == WideningDecisions.end())
1218 return CM_Unknown;
1219 return Itr->second.first;
1220 }
1221
1222 /// Return the vectorization cost for the given instruction \p I and vector
1223 /// width \p VF.
1225 assert(VF.isVector() && "Expected VF >=2");
1226 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1227 assert(WideningDecisions.contains(InstOnVF) &&
1228 "The cost is not calculated");
1229 return WideningDecisions[InstOnVF].second;
1230 }
1231
1236 std::optional<unsigned> MaskPos;
1238 };
1239
1241 Function *Variant, Intrinsic::ID IID,
1242 std::optional<unsigned> MaskPos,
1244 assert(!VF.isScalar() && "Expected vector VF");
1245 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1246 MaskPos, Cost};
1247 }
1248
1250 ElementCount VF) const {
1251 assert(!VF.isScalar() && "Expected vector VF");
1252 return CallWideningDecisions.at(std::make_pair(CI, VF));
1253 }
1254
1255 /// Return True if instruction \p I is an optimizable truncate whose operand
1256 /// is an induction variable. Such a truncate will be removed by adding a new
1257 /// induction variable with the destination type.
1259 // If the instruction is not a truncate, return false.
1260 auto *Trunc = dyn_cast<TruncInst>(I);
1261 if (!Trunc)
1262 return false;
1263
1264 // Get the source and destination types of the truncate.
1265 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1266 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1267
1268 // If the truncate is free for the given types, return false. Replacing a
1269 // free truncate with an induction variable would add an induction variable
1270 // update instruction to each iteration of the loop. We exclude from this
1271 // check the primary induction variable since it will need an update
1272 // instruction regardless.
1273 Value *Op = Trunc->getOperand(0);
1274 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1275 return false;
1276
1277 // If the truncated value is not an induction variable, return false.
1278 return Legal->isInductionPhi(Op);
1279 }
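  //
  // Illustrative sketch (not IR taken from this file): in
  //
  //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //   %t  = trunc i64 %iv to i32
  //   ... uses of %t ...
  //
  // the truncate of the induction %iv can be removed by introducing a new
  // i32 induction variable that produces the truncated values directly.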
1280
1281 /// Collects the instructions to scalarize for each predicated instruction in
1282 /// the loop.
1284
1285 /// Collect Uniform and Scalar values for the given \p VF.
1286 /// The sets depend on CM decision for Load/Store instructions
1287 /// that may be vectorized as interleave, gather-scatter or scalarized.
1288 /// Also make a decision on what to do about call instructions in the loop
1289 /// at that VF -- scalarize, call a known vector routine, or call a
1290 /// vector intrinsic.
1292 // Do the analysis once.
1293 if (VF.isScalar() || Uniforms.contains(VF))
1294 return;
1297 collectLoopUniforms(VF);
1298 collectLoopScalars(VF);
1299 }
1300
1301 /// Returns true if the target machine supports masked store operation
1302 /// for the given \p DataType and kind of access to \p Ptr.
1303 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1304 return Legal->isConsecutivePtr(DataType, Ptr) &&
1305 TTI.isLegalMaskedStore(DataType, Alignment);
1306 }
1307
1308 /// Returns true if the target machine supports masked load operation
1309 /// for the given \p DataType and kind of access to \p Ptr.
1310 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1311 return Legal->isConsecutivePtr(DataType, Ptr) &&
1312 TTI.isLegalMaskedLoad(DataType, Alignment);
1313 }
1314
1315 /// Returns true if the target machine can represent \p V as a masked gather
1316 /// or scatter operation.
1318 bool LI = isa<LoadInst>(V);
1319 bool SI = isa<StoreInst>(V);
1320 if (!LI && !SI)
1321 return false;
1322 auto *Ty = getLoadStoreType(V);
1324 if (VF.isVector())
1325 Ty = VectorType::get(Ty, VF);
1326 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1327 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1328 }
1329
1330 /// Returns true if the target machine supports all of the reduction
1331 /// variables found for the given VF.
1333 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1334 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1335 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1336 }));
1337 }
1338
1339 /// Given costs for both strategies, return true if the scalar predication
1340 /// lowering should be used for div/rem. This incorporates an override
1341 /// option so it is not simply a cost comparison.
1343 InstructionCost SafeDivisorCost) const {
1344 switch (ForceSafeDivisor) {
1345 case cl::BOU_UNSET:
1346 return ScalarCost < SafeDivisorCost;
1347 case cl::BOU_TRUE:
1348 return false;
1349 case cl::BOU_FALSE:
1350 return true;
1351 };
1352 llvm_unreachable("impossible case value");
1353 }
1354
1355 /// Returns true if \p I is an instruction which requires predication and
1356 /// for which our chosen predication strategy is scalarization (i.e. we
1357 /// don't have an alternate strategy such as masking available).
1358 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1360
1361 /// Returns true if \p I is an instruction that needs to be predicated
1362 /// at runtime. The result is independent of the predication mechanism.
1363 /// Superset of instructions that return true for isScalarWithPredication.
1364 bool isPredicatedInst(Instruction *I) const;
1365
1366 /// Return the costs for our two available strategies for lowering a
1367 /// div/rem operation which requires speculating at least one lane.
1368 /// First result is for scalarization (will be invalid for scalable
1369 /// vectors); second is for the safe-divisor strategy.
1370 std::pair<InstructionCost, InstructionCost>
1372 ElementCount VF) const;
1373
1374 /// Returns true if \p I is a memory instruction with consecutive memory
1375 /// access that can be widened.
1377
1378 /// Returns true if \p I is a memory instruction in an interleaved-group
1379 /// of memory accesses that can be vectorized with wide vector loads/stores
1380 /// and shuffles.
1382
1383 /// Check if \p Instr belongs to any interleaved access group.
1385 return InterleaveInfo.isInterleaved(Instr);
1386 }
1387
1388 /// Get the interleaved access group that \p Instr belongs to.
1391 return InterleaveInfo.getInterleaveGroup(Instr);
1392 }
1393
1394 /// Returns true if we're required to use a scalar epilogue for at least
1395 /// the final iteration of the original loop.
1396 bool requiresScalarEpilogue(bool IsVectorizing) const {
1397 if (!isScalarEpilogueAllowed()) {
1398 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1399 return false;
1400 }
1401 // If we might exit from anywhere but the latch, we must run the exiting
1402 // iteration in scalar form.
1404 LLVM_DEBUG(
1405 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1406 return true;
1407 }
1408 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1409 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1410 "interleaved group requires scalar epilogue\n");
1411 return true;
1412 }
1413 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1414 return false;
1415 }
1416
1417 /// Returns true if we're required to use a scalar epilogue for at least
1418 /// the final iteration of the original loop for all VFs in \p Range.
1419 /// A scalar epilogue must either be required for all VFs in \p Range or for
1420 /// none.
1422 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1423 return requiresScalarEpilogue(VF.isVector());
1424 };
1425 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1426 assert(
1427 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1428 "all VFs in range must agree on whether a scalar epilogue is required");
1429 return IsRequired;
1430 }
1431
1432 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1433 /// loop hint annotation.
1435 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1436 }
1437
1438 /// Returns the TailFoldingStyle that is best for the current loop.
1439 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1440 if (!ChosenTailFoldingStyle)
1442 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1443 : ChosenTailFoldingStyle->second;
1444 }
1445
1446 /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1447 /// update may overflow or not.
1448 /// \param IsScalableVF true if scalable vector factors enabled.
1449 /// \param UserIC User specific interleave count.
1450 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1451 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1452 if (!Legal->canFoldTailByMasking()) {
1453 ChosenTailFoldingStyle =
1455 return;
1456 }
1457
1458 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1459 ChosenTailFoldingStyle = std::make_pair(
1460 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1461 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1462 return;
1463 }
1464
1465 // Set styles when forced.
1466 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1467 ForceTailFoldingStyle.getValue());
1469 return;
1470 // Override forced styles if needed.
1471 // FIXME: use actual opcode/data type for analysis here.
1472 // FIXME: Investigate opportunity for fixed vector factor.
1473 bool EVLIsLegal =
1474 IsScalableVF && UserIC <= 1 &&
1475 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1477 // FIXME: implement support for max safe dependency distance.
1479 if (!EVLIsLegal) {
1480 // If for some reason EVL mode is unsupported, fall back to
1481 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1482 // in a generic way.
1483 ChosenTailFoldingStyle =
1486 LLVM_DEBUG(
1487 dbgs()
1488 << "LV: Preference for VP intrinsics indicated. Will "
1489 "not try to generate VP Intrinsics "
1490 << (UserIC > 1
1491 ? "since interleave count specified is greater than 1.\n"
1492 : "due to non-interleaving reasons.\n"));
1493 }
1494 }
1495
1496 /// Returns true if all loop blocks should be masked to fold tail loop.
1497 bool foldTailByMasking() const {
1498 // TODO: check if it is possible to check for None style independent of
1499 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1501 }
1502
1503 /// Returns true if the instructions in this block requires predication
1504 /// for any reason, e.g. because tail folding now requires a predicate
1505 /// or because the block in the original loop was predicated.
1508 }
1509
1510 /// Returns true if VP intrinsics with explicit vector length support should
1511 /// be generated in the tail folded loop.
1512 bool foldTailWithEVL() const {
1514 }
1515
1516 /// Returns true if the Phi is part of an inloop reduction.
1517 bool isInLoopReduction(PHINode *Phi) const {
1518 return InLoopReductions.contains(Phi);
1519 }
1520
1521 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1522 /// with factor VF. Return the cost of the instruction, including
1523 /// scalarization overhead if it's needed.
1525
1526 /// Estimate cost of a call instruction CI if it were vectorized with factor
1527 /// VF. Return the cost of the instruction, including scalarization overhead
1528 /// if it's needed.
1530
1531 /// Invalidates decisions already taken by the cost model.
1533 WideningDecisions.clear();
1534 CallWideningDecisions.clear();
1535 Uniforms.clear();
1536 Scalars.clear();
1537 }
1538
1539 /// Returns the expected execution cost. The unit of the cost does
1540 /// not matter because we use the 'cost' units to compare different
1541 /// vector widths. The cost that is returned is *not* normalized by
1542 /// the factor width.
1544
1545 bool hasPredStores() const { return NumPredStores > 0; }
1546
1547 /// Returns true if epilogue vectorization is considered profitable, and
1548 /// false otherwise.
1549 /// \p VF is the vectorization factor chosen for the original loop.
1551
1552 /// Returns the execution time cost of an instruction for a given vector
1553 /// width. Vector width of one means scalar.
1555
1556 /// Return the cost of instructions in an inloop reduction pattern, if I is
1557 /// part of that pattern.
1558 std::optional<InstructionCost>
1561
1562private:
1563 unsigned NumPredStores = 0;
1564
1565 /// \return An upper bound for the vectorization factors for both
1566 /// fixed and scalable vectorization, where the minimum-known number of
1567 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1568 /// disabled or unsupported, then the scalable part will be equal to
1569 /// ElementCount::getScalable(0).
1570 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1571 ElementCount UserVF,
1572 bool FoldTailByMasking);
1573
1574 /// \return the maximized element count based on the targets vector
1575 /// registers and the loop trip-count, but limited to a maximum safe VF.
1576 /// This is a helper function of computeFeasibleMaxVF.
1577 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1578 unsigned SmallestType,
1579 unsigned WidestType,
1580 ElementCount MaxSafeVF,
1581 bool FoldTailByMasking);
1582
1583 /// Checks if scalable vectorization is supported and enabled. Caches the
1584 /// result to avoid repeated debug dumps for repeated queries.
1585 bool isScalableVectorizationAllowed();
1586
1587 /// \return the maximum legal scalable VF, based on the safe max number
1588 /// of elements.
1589 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1590
1591 /// Calculate vectorization cost of memory instruction \p I.
1592 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1593
1594 /// The cost computation for scalarized memory instruction.
1595 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1596
1597 /// The cost computation for interleaving group of memory instructions.
1598 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1599
1600 /// The cost computation for Gather/Scatter instruction.
1601 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1602
1603 /// The cost computation for widening instruction \p I with consecutive
1604 /// memory access.
1605 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1606
1607 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1608 /// Load: scalar load + broadcast.
1609 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1610 /// element)
1611 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1612
1613 /// Estimate the overhead of scalarizing an instruction. This is a
1614 /// convenience wrapper for the type-based getScalarizationOverhead API.
1615 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1617
1618 /// Returns true if an artificially high cost for emulated masked memrefs
1619 /// should be used.
1620 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1621
1622 /// Map of scalar integer values to the smallest bitwidth they can be legally
1623 /// represented as. The vector equivalents of these values should be truncated
1624 /// to this type.
1626
1627 /// A type representing the costs for instructions if they were to be
1628 /// scalarized rather than vectorized. The entries are Instruction-Cost
1629 /// pairs.
1630 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1631
1632 /// A set containing all BasicBlocks that are known to be present after
1633 /// vectorization as predicated blocks.
1635 PredicatedBBsAfterVectorization;
1636
1637 /// Records whether it is allowed to have the original scalar loop execute at
1638 /// least once. This may be needed as a fallback loop in case runtime
1639 /// aliasing/dependence checks fail, or to handle the tail/remainder
1640 /// iterations when the trip count is unknown or doesn't divide by the VF,
1641 /// or as a peel-loop to handle gaps in interleave-groups.
1642 /// Under optsize and when the trip count is very small we don't allow any
1643 /// iterations to execute in the scalar loop.
1644 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1645
1646 /// The tail folding style finally chosen. The first element is used if
1647 /// the IV update may overflow; the second is used if it does not.
1648 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1649 ChosenTailFoldingStyle;
1650
1651 /// true if scalable vectorization is supported and enabled.
1652 std::optional<bool> IsScalableVectorizationAllowed;
1653
1654 /// A map holding scalar costs for different vectorization factors. The
1655 /// presence of a cost for an instruction in the mapping indicates that the
1656 /// instruction will be scalarized when vectorizing with the associated
1657 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1659
1660 /// Holds the instructions known to be uniform after vectorization.
1661 /// The data is collected per VF.
1663
1664 /// Holds the instructions known to be scalar after vectorization.
1665 /// The data is collected per VF.
1667
1668 /// Holds the instructions (address computations) that are forced to be
1669 /// scalarized.
1671
1672 /// PHINodes of the reductions that should be expanded in-loop.
1673 SmallPtrSet<PHINode *, 4> InLoopReductions;
1674
1675 /// A map of inloop reduction operations and their immediate chain operand.
1676 /// FIXME: This can be removed once reductions can be costed correctly in
1677 /// VPlan. This was added to allow quick lookup of the inloop operations.
1678 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1679
1680 /// Returns the expected difference in cost from scalarizing the expression
1681 /// feeding a predicated instruction \p PredInst. The instructions to
1682 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1683 /// non-negative return value implies the expression will be scalarized.
1684 /// Currently, only single-use chains are considered for scalarization.
1685 InstructionCost computePredInstDiscount(Instruction *PredInst,
1686 ScalarCostsTy &ScalarCosts,
1687 ElementCount VF);
1688
1689 /// Collect the instructions that are uniform after vectorization. An
1690 /// instruction is uniform if we represent it with a single scalar value in
1691 /// the vectorized loop corresponding to each vector iteration. Examples of
1692 /// uniform instructions include pointer operands of consecutive or
1693 /// interleaved memory accesses. Note that although uniformity implies an
1694 /// instruction will be scalar, the reverse is not true. In general, a
1695 /// scalarized instruction will be represented by VF scalar values in the
1696 /// vectorized loop, each corresponding to an iteration of the original
1697 /// scalar loop.
1698 void collectLoopUniforms(ElementCount VF);
1699
1700 /// Collect the instructions that are scalar after vectorization. An
1701 /// instruction is scalar if it is known to be uniform or will be scalarized
1702 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1703 /// to the list if they are used by a load/store instruction that is marked as
1704 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1705 /// VF values in the vectorized loop, each corresponding to an iteration of
1706 /// the original scalar loop.
1707 void collectLoopScalars(ElementCount VF);
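// A minimal illustration of the uniform/scalar distinction described above
// (hypothetical IR, assuming VF = 4): the pointer of a consecutive load is
// uniform and materialized once per vector iteration, whereas a scalarized
// add is replicated once per lane:
//   %gep       = getelementptr i32, ptr %base, i64 %index  ; uniform: 1 copy
//   %wide.load = load <4 x i32>, ptr %gep
//   %add.0 = add i32 %a.0, 1   ; scalarized: VF = 4 copies,
//   %add.1 = add i32 %a.1, 1   ; one per lane of the original
//   %add.2 = add i32 %a.2, 1   ; scalar loop iteration
//   %add.3 = add i32 %a.3, 1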
1708
1709 /// Keeps the cost model's vectorization decision and cost for each instruction.
1710 /// Right now it is used for memory instructions only.
1712 std::pair<InstWidening, InstructionCost>>;
1713
1714 DecisionList WideningDecisions;
1715
1716 using CallDecisionList =
1717 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1718
1719 CallDecisionList CallWideningDecisions;
1720
1721 /// Returns true if \p V is expected to be vectorized and it needs to be
1722 /// extracted.
1723 bool needsExtract(Value *V, ElementCount VF) const {
1724 Instruction *I = dyn_cast<Instruction>(V);
1725 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1727 return false;
1728
1729 // Assume we can vectorize V (and hence we need extraction) if the
1730 // scalars are not computed yet. This can happen, because it is called
1731 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1732 // the scalars are collected. That should be a safe assumption in most
1733 // cases, because we check if the operands have vectorizable types
1734 // beforehand in LoopVectorizationLegality.
1735 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1736 };
1737
1738 /// Returns a range containing only operands needing to be extracted.
1739 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1740 ElementCount VF) const {
1742 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1743 }
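// Sketch of the situation needsExtract() models (hypothetical IR, VF = 4): if
// %v is expected to be widened but one of its users is scalarized, each scalar
// clone must read its lane with an extractelement, and that is the overhead
// accounted for via getScalarizationOverhead():
//   %v.vec = ...                                     ; widened definition
//   %v.0   = extractelement <4 x i32> %v.vec, i32 0  ; extract for lane 0
//   %use.0 = udiv i32 %v.0, %d.0                     ; scalarized user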
1744
1745public:
1746 /// The loop that we evaluate.
1748
1749 /// Predicated scalar evolution analysis.
1751
1752 /// Loop Info analysis.
1754
1755 /// Vectorization legality.
1757
1758 /// Vector target information.
1760
1761 /// Target Library Info.
1763
1764 /// Demanded bits analysis.
1766
1767 /// Assumption cache.
1769
1770 /// Interface to emit optimization remarks.
1772
1774
1775 /// Loop Vectorize Hint.
1777
1778 /// The interleave access information contains groups of interleaved accesses
1779 /// with the same stride and close to each other.
1781
1782 /// Values to ignore in the cost model.
1784
1785 /// Values to ignore in the cost model when VF > 1.
1787
1788 /// All element types found in the loop.
1790};
1791} // end namespace llvm
1792
1793namespace {
1794/// Helper struct to manage generating runtime checks for vectorization.
1795///
1796 /// The runtime checks are created up-front in temporary blocks to allow more
1797 /// accurate cost estimation, and are un-linked from the existing IR. After deciding
1798 /// to vectorize, the checks are moved back. If deciding not to vectorize, the
1799/// temporary blocks are completely removed.
1800class GeneratedRTChecks {
1801 /// Basic block which contains the generated SCEV checks, if any.
1802 BasicBlock *SCEVCheckBlock = nullptr;
1803
1804 /// The value representing the result of the generated SCEV checks. If it is
1805 /// nullptr, either no SCEV checks have been generated or they have been used.
1806 Value *SCEVCheckCond = nullptr;
1807
1808 /// Basic block which contains the generated memory runtime checks, if any.
1809 BasicBlock *MemCheckBlock = nullptr;
1810
1811 /// The value representing the result of the generated memory runtime checks.
1812 /// If it is nullptr, either no memory runtime checks have been generated or
1813 /// they have been used.
1814 Value *MemRuntimeCheckCond = nullptr;
1815
1816 DominatorTree *DT;
1817 LoopInfo *LI;
1819
1820 SCEVExpander SCEVExp;
1821 SCEVExpander MemCheckExp;
1822
1823 bool CostTooHigh = false;
1824 const bool AddBranchWeights;
1825
1826 Loop *OuterLoop = nullptr;
1827
1828public:
1829 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1831 bool AddBranchWeights)
1832 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1833 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1834
1835 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1836 /// accurately estimate the cost of the runtime checks. The blocks are
1837 /// un-linked from the IR and are added back during vector code generation. If
1838 /// there is no vector code generation, the check blocks are removed
1839 /// completely.
1840 void Create(Loop *L, const LoopAccessInfo &LAI,
1841 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1842
1843 // Hard cutoff to limit compile-time increase in case a very large number of
1844 // runtime checks need to be generated.
1845 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1846 // profile info.
1847 CostTooHigh =
1849 if (CostTooHigh)
1850 return;
1851
1852 BasicBlock *LoopHeader = L->getHeader();
1853 BasicBlock *Preheader = L->getLoopPreheader();
1854
1855 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1856 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1857 // may be used by SCEVExpander. The blocks will be un-linked from their
1858 // predecessors and removed from LI & DT at the end of the function.
1859 if (!UnionPred.isAlwaysTrue()) {
1860 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1861 nullptr, "vector.scevcheck");
1862
1863 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1864 &UnionPred, SCEVCheckBlock->getTerminator());
1865 }
1866
1867 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1868 if (RtPtrChecking.Need) {
1869 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1870 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1871 "vector.memcheck");
1872
1873 auto DiffChecks = RtPtrChecking.getDiffChecks();
1874 if (DiffChecks) {
1875 Value *RuntimeVF = nullptr;
1876 MemRuntimeCheckCond = addDiffRuntimeChecks(
1877 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1878 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1879 if (!RuntimeVF)
1880 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1881 return RuntimeVF;
1882 },
1883 IC);
1884 } else {
1885 MemRuntimeCheckCond = addRuntimeChecks(
1886 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1888 }
1889 assert(MemRuntimeCheckCond &&
1890 "no RT checks generated although RtPtrChecking "
1891 "claimed checks are required");
1892 }
1893
1894 if (!MemCheckBlock && !SCEVCheckBlock)
1895 return;
1896
1897 // Unhook the temporary block with the checks, update various places
1898 // accordingly.
1899 if (SCEVCheckBlock)
1900 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1901 if (MemCheckBlock)
1902 MemCheckBlock->replaceAllUsesWith(Preheader);
1903
1904 if (SCEVCheckBlock) {
1905 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1906 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1907 Preheader->getTerminator()->eraseFromParent();
1908 }
1909 if (MemCheckBlock) {
1910 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1911 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1912 Preheader->getTerminator()->eraseFromParent();
1913 }
1914
1915 DT->changeImmediateDominator(LoopHeader, Preheader);
1916 if (MemCheckBlock) {
1917 DT->eraseNode(MemCheckBlock);
1918 LI->removeBlock(MemCheckBlock);
1919 }
1920 if (SCEVCheckBlock) {
1921 DT->eraseNode(SCEVCheckBlock);
1922 LI->removeBlock(SCEVCheckBlock);
1923 }
1924
1925 // Outer loop is used as part of the later cost calculations.
1926 OuterLoop = L->getParentLoop();
1927 }
1928
1929 InstructionCost getCost() {
1930 if (SCEVCheckBlock || MemCheckBlock)
1931 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1932
1933 if (CostTooHigh) {
1935 Cost.setInvalid();
1936 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1937 return Cost;
1938 }
1939
1940 InstructionCost RTCheckCost = 0;
1941 if (SCEVCheckBlock)
1942 for (Instruction &I : *SCEVCheckBlock) {
1943 if (SCEVCheckBlock->getTerminator() == &I)
1944 continue;
1947 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1948 RTCheckCost += C;
1949 }
1950 if (MemCheckBlock) {
1951 InstructionCost MemCheckCost = 0;
1952 for (Instruction &I : *MemCheckBlock) {
1953 if (MemCheckBlock->getTerminator() == &I)
1954 continue;
1957 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1958 MemCheckCost += C;
1959 }
1960
1961 // If the runtime memory checks are being created inside an outer loop
1962 // we should find out if these checks are outer loop invariant. If so,
1963 // the checks will likely be hoisted out and so the effective cost will
1964 // be reduced according to the outer loop trip count.
1965 if (OuterLoop) {
1966 ScalarEvolution *SE = MemCheckExp.getSE();
1967 // TODO: If profitable, we could refine this further by analysing every
1968 // individual memory check, since there could be a mixture of loop
1969 // variant and invariant checks that mean the final condition is
1970 // variant.
1971 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1972 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1973 // It seems reasonable to assume that we can reduce the effective
1974 // cost of the checks even when we know nothing about the trip
1975 // count. Assume that the outer loop executes at least twice.
1976 unsigned BestTripCount = 2;
1977
1978 // If the exact trip count is known, use that.
1979 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
1980 BestTripCount = SmallTC;
1982 // Else use profile data if available.
1983 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
1984 BestTripCount = *EstimatedTC;
1985 }
1986
1987 BestTripCount = std::max(BestTripCount, 1U);
1988 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1989
1990 // Let's ensure the cost is always at least 1.
1991 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
1993
1994 if (BestTripCount > 1)
1996 << "We expect runtime memory checks to be hoisted "
1997 << "out of the outer loop. Cost reduced from "
1998 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1999
2000 MemCheckCost = NewMemCheckCost;
2001 }
2002 }
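        // Worked example of the adjustment above (illustrative numbers only):
        // with MemCheckCost = 20 and an outer loop whose estimated trip count
        // is 5, BestTripCount = 5 and NewMemCheckCost = 20 / 5 = 4, reflecting
        // that the hoisted checks run once per 5 entries into the inner loop.
        // If the division were to round down to 0, the std::max above clamps
        // the cost back to 1.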
2003
2004 RTCheckCost += MemCheckCost;
2005 }
2006
2007 if (SCEVCheckBlock || MemCheckBlock)
2008 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2009 << "\n");
2010
2011 return RTCheckCost;
2012 }
2013
2014 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2015 /// unused.
2016 ~GeneratedRTChecks() {
2017 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2018 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2019 if (!SCEVCheckCond)
2020 SCEVCleaner.markResultUsed();
2021
2022 if (!MemRuntimeCheckCond)
2023 MemCheckCleaner.markResultUsed();
2024
2025 if (MemRuntimeCheckCond) {
2026 auto &SE = *MemCheckExp.getSE();
2027 // Memory runtime check generation creates compares that use expanded
2028 // values. Remove them before running the SCEVExpanderCleaners.
2029 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2030 if (MemCheckExp.isInsertedInstruction(&I))
2031 continue;
2032 SE.forgetValue(&I);
2033 I.eraseFromParent();
2034 }
2035 }
2036 MemCheckCleaner.cleanup();
2037 SCEVCleaner.cleanup();
2038
2039 if (SCEVCheckCond)
2040 SCEVCheckBlock->eraseFromParent();
2041 if (MemRuntimeCheckCond)
2042 MemCheckBlock->eraseFromParent();
2043 }
2044
2045 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2046 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2047 /// depending on the generated condition.
2048 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2049 BasicBlock *LoopVectorPreHeader,
2050 BasicBlock *LoopExitBlock) {
2051 if (!SCEVCheckCond)
2052 return nullptr;
2053
2054 Value *Cond = SCEVCheckCond;
2055 // Mark the check as used, to prevent it from being removed during cleanup.
2056 SCEVCheckCond = nullptr;
2057 if (auto *C = dyn_cast<ConstantInt>(Cond))
2058 if (C->isZero())
2059 return nullptr;
2060
2061 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2062
2063 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2064 // Create new preheader for vector loop.
2065 if (OuterLoop)
2066 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2067
2068 SCEVCheckBlock->getTerminator()->eraseFromParent();
2069 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2070 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2071 SCEVCheckBlock);
2072
2073 DT->addNewBlock(SCEVCheckBlock, Pred);
2074 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2075
2076 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2077 if (AddBranchWeights)
2078 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2079 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2080 return SCEVCheckBlock;
2081 }
2082
2083 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2084 /// the branches to branch to the vector preheader or \p Bypass, depending on
2085 /// the generated condition.
2086 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2087 BasicBlock *LoopVectorPreHeader) {
2088 // Check if we generated code that checks at runtime whether arrays overlap.
2089 if (!MemRuntimeCheckCond)
2090 return nullptr;
2091
2092 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2093 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2094 MemCheckBlock);
2095
2096 DT->addNewBlock(MemCheckBlock, Pred);
2097 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2098 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2099
2100 if (OuterLoop)
2101 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2102
2103 BranchInst &BI =
2104 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2105 if (AddBranchWeights) {
2106 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2107 }
2108 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2109 MemCheckBlock->getTerminator()->setDebugLoc(
2110 Pred->getTerminator()->getDebugLoc());
2111
2112 // Mark the check as used, to prevent it from being removed during cleanup.
2113 MemRuntimeCheckCond = nullptr;
2114 return MemCheckBlock;
2115 }
2116};
2117} // namespace
2118
2120 return Style == TailFoldingStyle::Data ||
2121 Style == TailFoldingStyle::DataAndControlFlow ||
2122 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2123}
2124
2126 return Style == TailFoldingStyle::DataAndControlFlow ||
2127 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2128}
2129
2130// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2131// vectorization. The loop needs to be annotated with #pragma omp simd
2132// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2133// vector length information is not provided, vectorization is not considered
2134// explicit. Interleave hints are not allowed either. These limitations will be
2135// relaxed in the future.
2136 // Please note that we are currently forced to abuse the pragma 'clang
2137// vectorize' semantics. This pragma provides *auto-vectorization hints*
2138// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2139// provides *explicit vectorization hints* (LV can bypass legal checks and
2140// assume that vectorization is legal). However, both hints are implemented
2141// using the same metadata (llvm.loop.vectorize, processed by
2142// LoopVectorizeHints). This will be fixed in the future when the native IR
2143// representation for pragma 'omp simd' is introduced.
2144static bool isExplicitVecOuterLoop(Loop *OuterLp,
2146 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2147 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2148
2149 // Only outer loops with an explicit vectorization hint are supported.
2150 // Unannotated outer loops are ignored.
2152 return false;
2153
2154 Function *Fn = OuterLp->getHeader()->getParent();
2155 if (!Hints.allowVectorization(Fn, OuterLp,
2156 true /*VectorizeOnlyWhenForced*/)) {
2157 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2158 return false;
2159 }
2160
2161 if (Hints.getInterleave() > 1) {
2162 // TODO: Interleave support is future work.
2163 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2164 "outer loops.\n");
2165 Hints.emitRemarkWithHints();
2166 return false;
2167 }
2168
2169 return true;
2170}
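// Illustrative source-level annotation that this predicate looks for
// (hypothetical user code; what actually matters is the 'llvm.loop.vectorize'
// metadata parsed by LoopVectorizeHints):
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // outer loop, explicitly annotated
//     for (int j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];
// Without an explicit width (vectorize_width(#)/simdlen(#)), or with an
// interleave hint, the outer loop is rejected by the checks above.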
2171
2175 // Collect inner loops and outer loops without irreducible control flow. For
2176 // now, only collect outer loops that have explicit vectorization hints. If we
2177 // are stress testing the VPlan H-CFG construction, we collect the outermost
2178 // loop of every loop nest.
2179 if (L.isInnermost() || VPlanBuildStressTest ||
2181 LoopBlocksRPO RPOT(&L);
2182 RPOT.perform(LI);
2183 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2184 V.push_back(&L);
2185 // TODO: Collect inner loops inside marked outer loops in case
2186 // vectorization fails for the outer loop. Do not invoke
2187 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2188 // already known to be reducible. We can use an inherited attribute for
2189 // that.
2190 return;
2191 }
2192 }
2193 for (Loop *InnerL : L)
2194 collectSupportedLoops(*InnerL, LI, ORE, V);
2195}
2196
2197//===----------------------------------------------------------------------===//
2198// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2199// LoopVectorizationCostModel and LoopVectorizationPlanner.
2200//===----------------------------------------------------------------------===//
2201
2202/// Compute the transformed value of Index at offset StartValue using step
2203/// StepValue.
2204/// For integer induction, returns StartValue + Index * StepValue.
2205/// For pointer induction, returns StartValue[Index * StepValue].
2206/// FIXME: The newly created binary instructions should contain nsw/nuw
2207/// flags, which can be found from the original scalar operations.
2208static Value *
2210 Value *Step,
2212 const BinaryOperator *InductionBinOp) {
2213 Type *StepTy = Step->getType();
2214 Value *CastedIndex = StepTy->isIntegerTy()
2215 ? B.CreateSExtOrTrunc(Index, StepTy)
2216 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2217 if (CastedIndex != Index) {
2218 CastedIndex->setName(CastedIndex->getName() + ".cast");
2219 Index = CastedIndex;
2220 }
2221
2222 // Note: the IR at this point is broken. We cannot use SE to create any new
2223 // SCEV and then expand it, hoping that SCEV's simplification will give us
2224 // better code. Unfortunately, attempting to do so on invalid IR may
2225 // lead to various SCEV crashes. So all we can do is to use builder and rely
2226 // on InstCombine for future simplifications. Here we handle some trivial
2227 // cases only.
2228 auto CreateAdd = [&B](Value *X, Value *Y) {
2229 assert(X->getType() == Y->getType() && "Types don't match!");
2230 if (auto *CX = dyn_cast<ConstantInt>(X))
2231 if (CX->isZero())
2232 return Y;
2233 if (auto *CY = dyn_cast<ConstantInt>(Y))
2234 if (CY->isZero())
2235 return X;
2236 return B.CreateAdd(X, Y);
2237 };
2238
2239 // We allow X to be a vector type, in which case Y will potentially be
2240 // splatted into a vector with the same element count.
2241 auto CreateMul = [&B](Value *X, Value *Y) {
2242 assert(X->getType()->getScalarType() == Y->getType() &&
2243 "Types don't match!");
2244 if (auto *CX = dyn_cast<ConstantInt>(X))
2245 if (CX->isOne())
2246 return Y;
2247 if (auto *CY = dyn_cast<ConstantInt>(Y))
2248 if (CY->isOne())
2249 return X;
2250 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2251 if (XVTy && !isa<VectorType>(Y->getType()))
2252 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2253 return B.CreateMul(X, Y);
2254 };
2255
2256 switch (InductionKind) {
2258 assert(!isa<VectorType>(Index->getType()) &&
2259 "Vector indices not supported for integer inductions yet");
2260 assert(Index->getType() == StartValue->getType() &&
2261 "Index type does not match StartValue type");
2262 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2263 return B.CreateSub(StartValue, Index);
2264 auto *Offset = CreateMul(Index, Step);
2265 return CreateAdd(StartValue, Offset);
2266 }
2268 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2270 assert(!isa<VectorType>(Index->getType()) &&
2271 "Vector indices not supported for FP inductions yet");
2272 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2273 assert(InductionBinOp &&
2274 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2275 InductionBinOp->getOpcode() == Instruction::FSub) &&
2276 "Original bin op should be defined for FP induction");
2277
2278 Value *MulExp = B.CreateFMul(Step, Index);
2279 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2280 "induction");
2281 }
2283 return nullptr;
2284 }
2285 llvm_unreachable("invalid enum");
2286}
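// Worked examples of the transformation above (illustrative values only):
//  * Integer induction: StartValue = 5, Step = 3, Index = 4
//      => 5 + 4 * 3 = 17 (emitted as mul + add, with the X*1 / X+0 cases
//         folded away by CreateMul/CreateAdd).
//  * Pointer induction: StartValue = %base, Step = 8, Index = 4
//      => ptradd %base, 32, i.e. &StartValue[Index * StepValue].
//  * FP induction with FAdd: StartValue = 1.0, Step = 0.5, Index = 4
//      => 1.0 + (0.5 * 4.0) = 3.0.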
2287
2288std::optional<unsigned> getMaxVScale(const Function &F,
2289 const TargetTransformInfo &TTI) {
2290 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2291 return MaxVScale;
2292
2293 if (F.hasFnAttribute(Attribute::VScaleRange))
2294 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2295
2296 return std::nullopt;
2297}
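// Example of the fallback path above (hypothetical function attribute): for a
// target where TTI reports no maximum vscale but the function carries
//   attributes #0 = { vscale_range(1,16) }
// getMaxVScale() returns 16; with neither source of information it returns
// std::nullopt and callers must be conservative.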
2298
2299/// For the given VF and UF and maximum trip count computed for the loop, return
2300/// whether the induction variable might overflow in the vectorized loop. If not,
2301/// then we know a runtime overflow check always evaluates to false and can be
2302/// removed.
2305 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2306 // Always be conservative if we don't know the exact unroll factor.
2307 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2308
2309 Type *IdxTy = Cost->Legal->getWidestInductionType();
2310 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2311
2312 // The runtime overflow check is known to be false iff the (max) trip-count
2313 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2314 // the vector loop induction variable.
2315 if (unsigned TC =
2316 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2317 uint64_t MaxVF = VF.getKnownMinValue();
2318 if (VF.isScalable()) {
2319 std::optional<unsigned> MaxVScale =
2320 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2321 if (!MaxVScale)
2322 return false;
2323 MaxVF *= *MaxVScale;
2324 }
2325
2326 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2327 }
2328
2329 return false;
2330}
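// Worked example (illustrative numbers): with an i8 widest induction type,
// MaxUIntTripCount = 255. If the maximum trip count is known to be 200 and
// VF = 4, UF = 2 (fixed-width, so MaxVF * MaxUF = 8), then
//   255 - 200 = 55 > 8,
// so the IV cannot wrap in the vector loop and the function returns true,
// i.e. the runtime overflow check is known false and can be dropped.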
2331
2332// Return whether we allow using masked interleave-groups (for dealing with
2333// strided loads/stores that reside in predicated blocks, or for dealing
2334// with gaps).
2336 // If an override option has been passed in for interleaved accesses, use it.
2337 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2339
2341}
2342
2344 VPReplicateRecipe *RepRecipe,
2345 const VPIteration &Instance,
2346 VPTransformState &State) {
2347 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2348
2349 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2350 // the first lane and part.
2351 if (isa<NoAliasScopeDeclInst>(Instr))
2352 if (!Instance.isFirstIteration())
2353 return;
2354
2356 // Does this instruction return a value?
2356 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2357
2358 Instruction *Cloned = Instr->clone();
2359 if (!IsVoidRetTy) {
2360 Cloned->setName(Instr->getName() + ".cloned");
2361#if !defined(NDEBUG)
2362 // Verify that VPlan type inference results agree with the type of the
2363 // generated values.
2364 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2365 "inferred type and type from generated instructions do not match");
2366#endif
2367 }
2368
2369 RepRecipe->setFlags(Cloned);
2370
2371 if (auto DL = Instr->getDebugLoc())
2372 State.setDebugLocFrom(DL);
2373
2374 // Replace the operands of the cloned instructions with their scalar
2375 // equivalents in the new loop.
2376 for (const auto &I : enumerate(RepRecipe->operands())) {
2377 auto InputInstance = Instance;
2378 VPValue *Operand = I.value();
2380 InputInstance.Lane = VPLane::getFirstLane();
2381 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2382 }
2383 State.addNewMetadata(Cloned, Instr);
2384
2385 // Place the cloned scalar in the new loop.
2386 State.Builder.Insert(Cloned);
2387
2388 State.set(RepRecipe, Cloned, Instance);
2389
2390 // If we just cloned a new assumption, add it to the assumption cache.
2391 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2393
2394 // End if-block.
2395 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2396 if (IfPredicateInstr)
2397 PredicatedInstructions.push_back(Cloned);
2398}
2399
2400Value *
2402 if (VectorTripCount)
2403 return VectorTripCount;
2404
2405 Value *TC = getTripCount();
2406 IRBuilder<> Builder(InsertBlock->getTerminator());
2407
2408 Type *Ty = TC->getType();
2409 // This is where we can make the step a runtime constant.
2410 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2411
2412 // If the tail is to be folded by masking, round the number of iterations N
2413 // up to a multiple of Step instead of rounding down. This is done by first
2414 // adding Step-1 and then rounding down. Note that it's ok if this addition
2415 // overflows: the vector induction variable will eventually wrap to zero given
2416 // that it starts at zero and its Step is a power of two; the loop will then
2417 // exit, with the last early-exit vector comparison also producing all-true.
2418 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2419 // is accounted for in emitIterationCountCheck that adds an overflow check.
2420 if (Cost->foldTailByMasking()) {
2422 "VF*UF must be a power of 2 when folding tail by masking");
2423 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2424 "n.rnd.up");
2425 }
2426
2427 // Now we need to generate the expression for the part of the loop that the
2428 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2429 // iterations are not required for correctness, or N - Step, otherwise. Step
2430 // is equal to the vectorization factor (number of SIMD elements) times the
2431 // unroll factor (number of SIMD instructions).
2432 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2433
2434 // There are cases where we *must* run at least one iteration in the remainder
2435 // loop. See the cost model for when this can happen. If the step evenly
2436 // divides the trip count, we set the remainder to be equal to the step. If
2437 // the step does not evenly divide the trip count, no adjustment is necessary
2438 // since there will already be scalar iterations. Note that the minimum
2439 // iterations check ensures that N >= Step.
2440 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2441 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2442 R = Builder.CreateSelect(IsZero, Step, R);
2443 }
2444
2445 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2446
2447 return VectorTripCount;
2448}
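// Worked example of the trip-count computation above (illustrative numbers):
// with TC = 10, VF = 4, UF = 1 and no tail folding,
//   n.mod.vf = 10 urem 4 = 2   and   n.vec = 10 - 2 = 8,
// so the vector loop covers 8 iterations and the scalar loop the final 2.
// With tail folding, TC is first rounded up: 10 + (4 - 1) = 13, 13 urem 4 = 1,
// n.vec = 12, and the masked vector loop handles all iterations. If a scalar
// epilogue is required and Step evenly divides TC (e.g. TC = 8), R is bumped
// from 0 to 4 so that at least one scalar iteration remains (n.vec = 4).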
2449
2451 Value *Count = getTripCount();
2452 // Reuse existing vector loop preheader for TC checks.
2453 // Note that a new preheader block is generated for the vector loop.
2454 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2455 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2456
2457 // Generate code to check if the loop's trip count is less than VF * UF, or
2458 // equal to it in case a scalar epilogue is required; this implies that the
2459 // vector trip count is zero. This check also covers the case where adding one
2460 // to the backedge-taken count overflowed leading to an incorrect trip count
2461 // of zero. In this case we will also jump to the scalar loop.
2462 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2464
2465 // If tail is to be folded, vector loop takes care of all iterations.
2466 Type *CountTy = Count->getType();
2467 Value *CheckMinIters = Builder.getFalse();
2468 auto CreateStep = [&]() -> Value * {
2469 // Create step with max(MinProTripCount, UF * VF).
2471 return createStepForVF(Builder, CountTy, VF, UF);
2472
2473 Value *MinProfTC =
2475 if (!VF.isScalable())
2476 return MinProfTC;
2478 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2479 };
2480
2481 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2482 if (Style == TailFoldingStyle::None)
2483 CheckMinIters =
2484 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2485 else if (VF.isScalable() &&
2488 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2489 // an overflow to zero when updating induction variables and so an
2490 // additional overflow check is required before entering the vector loop.
2491
2492 // Get the maximum unsigned value for the type.
2493 Value *MaxUIntTripCount =
2494 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2495 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2496
2497 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2498 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2499 }
2500
2501 // Create new preheader for vector loop.
2503 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2504 "vector.ph");
2505
2506 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2507 DT->getNode(Bypass)->getIDom()) &&
2508 "TC check is expected to dominate Bypass");
2509
2510 // Update dominator for Bypass & LoopExit (if needed).
2511 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2512 BranchInst &BI =
2513 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2515 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2516 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2517 LoopBypassBlocks.push_back(TCCheckBlock);
2518}
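// Worked example of the overflow check above (illustrative numbers): for an
// i32 trip count n = 2^32 - 3 and a runtime step VF * UF = 8,
//   LHS = UINT32_MAX - n = 2,  and  2 < 8,
// so CheckMinIters is true and the scalar loop is taken. Because vscale need
// not be a power of two, the usual wrap-to-zero guarantee for the rounded-up
// trip count does not hold, hence this extra guard.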
2519
2521 BasicBlock *const SCEVCheckBlock =
2522 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2523 if (!SCEVCheckBlock)
2524 return nullptr;
2525
2526 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2528 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2529 "Cannot SCEV check stride or overflow when optimizing for size");
2530
2531
2532 // Update the dominator only if this is the first RT check.
2533 if (LoopBypassBlocks.empty()) {
2534 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2535 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2536 // If there is an epilogue which must run, there's no edge from the
2537 // middle block to exit blocks and thus no need to update the immediate
2538 // dominator of the exit blocks.
2539 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2540 }
2541
2542 LoopBypassBlocks.push_back(SCEVCheckBlock);
2543 AddedSafetyChecks = true;
2544 return SCEVCheckBlock;
2545}
2546
2548 // VPlan-native path does not do any analysis for runtime checks currently.
2550 return nullptr;
2551
2552 BasicBlock *const MemCheckBlock =
2553 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2554
2555 // Check if we generated code that checks at runtime whether arrays overlap. We put
2556 // the checks into a separate block to make the more common case of few
2557 // elements faster.
2558 if (!MemCheckBlock)
2559 return nullptr;
2560
2561 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2562 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2563 "Cannot emit memory checks when optimizing for size, unless forced "
2564 "to vectorize.");
2565 ORE->emit([&]() {
2566 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2569 << "Code-size may be reduced by not forcing "
2570 "vectorization, or by source-code modifications "
2571 "eliminating the need for runtime checks "
2572 "(e.g., adding 'restrict').";
2573 });
2574 }
2575
2576 LoopBypassBlocks.push_back(MemCheckBlock);
2577
2578 AddedSafetyChecks = true;
2579
2580 return MemCheckBlock;
2581}
2582
2586 assert(LoopVectorPreHeader && "Invalid loop structure");
2587 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2588 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2589 "multiple exit loop without required epilogue?");
2590
2593 LI, nullptr, Twine(Prefix) + "middle.block");
2596 nullptr, Twine(Prefix) + "scalar.ph");
2597}
2598
2600 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2601 ArrayRef<BasicBlock *> BypassBlocks,
2602 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2604 assert(VectorTripCount && "Expected valid arguments");
2605
2606 Instruction *OldInduction = Legal->getPrimaryInduction();
2607 Value *&EndValue = IVEndValues[OrigPhi];
2608 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2609 if (OrigPhi == OldInduction) {
2610 // We know what the end value is.
2611 EndValue = VectorTripCount;
2612 } else {
2614
2615 // Fast-math-flags propagate from the original induction instruction.
2616 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2617 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2618
2619 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2620 Step, II.getKind(), II.getInductionBinOp());
2621 EndValue->setName("ind.end");
2622
2623 // Compute the end value for the additional bypass (if applicable).
2624 if (AdditionalBypass.first) {
2625 B.SetInsertPoint(AdditionalBypass.first,
2626 AdditionalBypass.first->getFirstInsertionPt());
2627 EndValueFromAdditionalBypass =
2628 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
2629 Step, II.getKind(), II.getInductionBinOp());
2630 EndValueFromAdditionalBypass->setName("ind.end");
2631 }
2632 }
2633
2634 // Create phi nodes to merge from the backedge-taken check block.
2635 PHINode *BCResumeVal =
2636 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
2638 // Copy original phi DL over to the new one.
2639 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2640
2641 // The new PHI merges the original incoming value, in case of a bypass,
2642 // or the value at the end of the vectorized loop.
2643 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
2644
2645 // Fix the scalar body counter (PHI node).
2646 // The old induction's phi node in the scalar body needs the truncated
2647 // value.
2648 for (BasicBlock *BB : BypassBlocks)
2649 BCResumeVal->addIncoming(II.getStartValue(), BB);
2650
2651 if (AdditionalBypass.first)
2652 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
2653 EndValueFromAdditionalBypass);
2654 return BCResumeVal;
2655}
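// Sketch of the resume PHI created above (hypothetical IR): in the scalar
// preheader, the induction resumes either from the vector loop's end value or
// from the original start value when a bypass branch was taken:
//   scalar.ph:
//     %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
//                              [ 0, %vector.scevcheck ],
//                              [ 0, %vector.memcheck ]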
2656
2657/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2658/// expansion results.
2660 const SCEV2ValueTy &ExpandedSCEVs) {
2661 const SCEV *Step = ID.getStep();
2662 if (auto *C = dyn_cast<SCEVConstant>(Step))
2663 return C->getValue();
2664 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2665 return U->getValue();
2666 auto I = ExpandedSCEVs.find(Step);
2667 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2668 return I->second;
2669}
2670
2672 const SCEV2ValueTy &ExpandedSCEVs,
2673 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2674 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
2675 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
2676 "Inconsistent information about additional bypass.");
2677 // We are going to resume the execution of the scalar loop.
2678 // Go over all of the induction variables that we found and fix the
2679 // PHIs that are left in the scalar version of the loop.
2680 // The starting values of PHI nodes depend on the counter of the last
2681 // iteration in the vectorized loop.
2682 // If we come from a bypass edge then we need to start from the original
2683 // start value.
2684 for (const auto &InductionEntry : Legal->getInductionVars()) {
2685 PHINode *OrigPhi = InductionEntry.first;
2686 const InductionDescriptor &II = InductionEntry.second;
2687 PHINode *BCResumeVal = createInductionResumeValue(
2688 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
2689 AdditionalBypass);
2690 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
2691 }
2692}
2693
2694std::pair<BasicBlock *, Value *>
2696 const SCEV2ValueTy &ExpandedSCEVs) {
2697 /*
2698 In this function we generate a new loop. The new loop will contain
2699 the vectorized instructions while the old loop will continue to run the
2700 scalar remainder.
2701
2702 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2703 / | preheader are expanded here. Eventually all required SCEV
2704 / | expansion should happen here.
2705 / v
2706 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2707 | / |
2708 | / v
2709 || [ ] <-- vector pre header.
2710 |/ |
2711 | v
2712 | [ ] \
2713 | [ ]_| <-- vector loop (created during VPlan execution).
2714 | |
2715 | v
2716 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2717 | | successors created during VPlan execution)
2718 \/ |
2719 /\ v
2720 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2721 | |
2722 (opt) v <-- edge from middle to exit iff epilogue is not required.
2723 | [ ] \
2724 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
2725 \ |
2726 \ v
2727 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2728 ...
2729 */
2730
2731 // Create an empty vector loop, and prepare basic blocks for the runtime
2732 // checks.
2734
2735 // Now, compare the new count to zero. If it is zero skip the vector loop and
2736 // jump to the scalar loop. This check also covers the case where the
2737 // backedge-taken count is uint##_max: adding one to it will overflow leading
2738 // to an incorrect trip count of zero. In this (rare) case we will also jump
2739 // to the scalar loop.
2741
2742 // Generate the code to check any assumptions that we've made for SCEV
2743 // expressions.
2745
2746 // Generate the code that checks at runtime whether arrays overlap. We put the
2747 // checks into a separate block to make the more common case of few elements
2748 // faster.
2750
2751 // Emit phis for the new starting index of the scalar loop.
2752 createInductionResumeValues(ExpandedSCEVs);
2753
2754 return {LoopVectorPreHeader, nullptr};
2755}
2756
2757// Fix up external users of the induction variable. At this point, we are
2758// in LCSSA form, with all external PHIs that use the IV having one input value,
2759// coming from the remainder loop. We need those PHIs to also have a correct
2760// value for the IV when arriving directly from the middle block.
2762 const InductionDescriptor &II,
2763 Value *VectorTripCount, Value *EndValue,
2764 BasicBlock *MiddleBlock,
2765 BasicBlock *VectorHeader, VPlan &Plan,
2766 VPTransformState &State) {
2767 // There are two kinds of external IV usages - those that use the value
2768 // computed in the last iteration (the PHI) and those that use the penultimate
2769 // value (the value that feeds into the phi from the loop latch).
2770 // We allow both, but they, obviously, have different values.
2771
2772 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
2773
2774 DenseMap<Value *, Value *> MissingVals;
2775
2776 // An external user of the last iteration's value should see the value that
2777 // the remainder loop uses to initialize its own IV.
2779 for (User *U : PostInc->users()) {
2780 Instruction *UI = cast<Instruction>(U);
2781 if (!OrigLoop->contains(UI)) {
2782 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2783 MissingVals[UI] = EndValue;
2784 }
2785 }
2786
2787 // An external user of the penultimate value needs to see EndValue - Step.
2788 // The simplest way to get this is to recompute it from the constituent SCEVs,
2789 // that is Start + (Step * (CRD - 1)).
2790 for (User *U : OrigPhi->users()) {
2791 auto *UI = cast<Instruction>(U);
2792 if (!OrigLoop->contains(UI)) {
2793 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2794 IRBuilder<> B(MiddleBlock->getTerminator());
2795
2796 // Fast-math-flags propagate from the original induction instruction.
2797 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2798 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2799
2800 Value *CountMinusOne = B.CreateSub(
2801 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
2802 CountMinusOne->setName("cmo");
2803
2804 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2805 assert(StepVPV && "step must have been expanded during VPlan execution");
2806 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2807 : State.get(StepVPV, {0, 0});
2808 Value *Escape =
2809 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
2810 II.getKind(), II.getInductionBinOp());
2811 Escape->setName("ind.escape");
2812 MissingVals[UI] = Escape;
2813 }
2814 }
2815
2816 for (auto &I : MissingVals) {
2817 PHINode *PHI = cast<PHINode>(I.first);
2818 // One corner case we have to handle is two IVs "chasing" each other,
2819 // that is %IV2 = phi [...], [ %IV1, %latch ]
2820 // In this case, if IV1 has an external use, we need to avoid adding both
2821 // "last value of IV1" and "penultimate value of IV2". So, verify that we
2822 // don't already have an incoming value for the middle block.
2823 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2824 PHI->addIncoming(I.second, MiddleBlock);
2825 }
2826}
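// Illustration of the two external-use kinds handled above (hypothetical IR,
// LCSSA form):
//   loop:
//     %iv      = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
//     %iv.next = add i64 %iv, 1
//     ...
//   exit:
//     %last = phi i64 [ %iv.next, %loop ]  ; last value  -> gets EndValue
//     %pen  = phi i64 [ %iv, %loop ]       ; penultimate -> gets the
//                                          ; recomputed Start + Step*(CRD-1)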
2827
2828namespace {
2829
2830struct CSEDenseMapInfo {
2831 static bool canHandle(const Instruction *I) {
2832 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2833 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2834 }
2835
2836 static inline Instruction *getEmptyKey() {
2838 }
2839
2840 static inline Instruction *getTombstoneKey() {
2842 }
2843
2844 static unsigned getHashValue(const Instruction *I) {
2845 assert(canHandle(I) && "Unknown instruction!");
2846 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2847 I->value_op_end()));
2848 }
2849
2850 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2851 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2852 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2853 return LHS == RHS;
2854 return LHS->isIdenticalTo(RHS);
2855 }
2856};
2857
2858} // end anonymous namespace
2859
2860 /// Perform common subexpression elimination (CSE) of induction variable instructions.
2861static void cse(BasicBlock *BB) {
2862 // Perform simple CSE.
2864 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2865 if (!CSEDenseMapInfo::canHandle(&In))
2866 continue;
2867
2868 // Check if we can replace this instruction with any of the
2869 // visited instructions.
2870 if (Instruction *V = CSEMap.lookup(&In)) {
2871 In.replaceAllUsesWith(V);
2872 In.eraseFromParent();
2873 continue;
2874 }
2875
2876 CSEMap[&In] = &In;
2877 }
2878}
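// Example of what cse() cleans up (hypothetical IR): widening the same
// induction twice can leave identical address computations behind, and the
// second one is folded into the first:
//   %gep1 = getelementptr inbounds i32, ptr %base, i64 %index
//   %gep2 = getelementptr inbounds i32, ptr %base, i64 %index  ; replaced by
//                                                              ; %gep1, erased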
2879
2882 ElementCount VF) const {
2883 // We only need to calculate a cost if the VF is scalar; for actual vectors
2884 // we should already have a pre-calculated cost at each VF.
2885 if (!VF.isScalar())
2886 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2887
2889 Type *RetTy = CI->getType();
2891 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2892 return *RedCost;
2893
2895 for (auto &ArgOp : CI->args())
2896 Tys.push_back(ArgOp->getType());
2897
2898 InstructionCost ScalarCallCost =
2900
2901 // If this is an intrinsic we may have a lower cost for it.
2903 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2904 return std::min(ScalarCallCost, IntrinsicCost);
2905 }
2906 return ScalarCallCost;
2907}
2908
2910 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2911 return Elt;
2912 return VectorType::get(Elt, VF);
2913}
2914
2917 ElementCount VF) const {
2919 assert(ID && "Expected intrinsic call!");
2920 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
2921 FastMathFlags FMF;
2922 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2923 FMF = FPMO->getFastMathFlags();
2924
2927 SmallVector<Type *> ParamTys;
2928 std::transform(FTy->param_begin(), FTy->param_end(),
2929 std::back_inserter(ParamTys),
2930 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
2931
2932 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2933 dyn_cast<IntrinsicInst>(CI));
2934 return TTI.getIntrinsicInstrCost(CostAttrs,
2936}
2937
2939 VPlan &Plan) {
2940 // Fix widened non-induction PHIs by setting up the PHI operands.
2942 fixNonInductionPHIs(Plan, State);
2943
2944 // Forget the original basic block.
2947
2948 // After vectorization, the exit blocks of the original loop will have
2949 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2950 // looked through single-entry phis.
2951 SmallVector<BasicBlock *> ExitBlocks;
2952 OrigLoop->getExitBlocks(ExitBlocks);
2953 for (BasicBlock *Exit : ExitBlocks)
2954 for (PHINode &PN : Exit->phis())
2956
2957 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2958 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
2959 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
2960 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2961 // No edge from the middle block to the unique exit block has been inserted
2962 // and there is nothing to fix from the vector loop; phis should have incoming
2963 // values from the scalar loop only.
2964 } else {
2965 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2966 // the cost model.
2967
2968 // If we inserted an edge from the middle block to the unique exit block,
2969 // update uses outside the loop (phis) to account for the newly inserted
2970 // edge.
2971
2972 // Fix-up external users of the induction variables.
2973 for (const auto &Entry : Legal->getInductionVars())
2974 fixupIVUsers(Entry.first, Entry.second,
2976 IVEndValues[Entry.first], LoopMiddleBlock,
2977 VectorLoop->getHeader(), Plan, State);
2978 }
2979
2980 // Fix live-out phis not already fixed earlier.
2981 for (const auto &KV : Plan.getLiveOuts())
2982 KV.second->fixPhi(Plan, State);
2983
2985 sinkScalarOperands(&*PI);
2986
2987 // Remove redundant induction instructions.
2988 cse(VectorLoop->getHeader());
2989
2990 // Set/update profile weights for the vector and remainder loops as original
2991 // loop iterations are now distributed among them. Note that the original loop
2992 // represented by LoopScalarBody becomes the remainder loop after vectorization.
2993 //
2994 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
2995 // end up getting a slightly less accurate result, but that should be OK since
2996 // profile is not inherently precise anyway. Note also possible bypass of
2997 // vector code caused by legality checks is ignored, assigning all the weight
2998 // to the vector loop, optimistically.
2999 //
3000 // For scalable vectorization we can't know at compile time how many iterations
3001 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3002 // vscale of '1'.
3005 VF.getKnownMinValue() * UF);
3006}
3007
3009 // The basic block and loop containing the predicated instruction.
3010 auto *PredBB = PredInst->getParent();
3011 auto *VectorLoop = LI->getLoopFor(PredBB);
3012
3013 // Initialize a worklist with the operands of the predicated instruction.
3014 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3015
3016 // Holds instructions that we need to analyze again. An instruction may be
3017 // reanalyzed if we don't yet know if we can sink it or not.
3018 SmallVector<Instruction *, 8> InstsToReanalyze;
3019
3020 // Returns true if a given use occurs in the predicated block. Phi nodes use
3021 // their operands in their corresponding predecessor blocks.
3022 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3023 auto *I = cast<Instruction>(U.getUser());
3024 BasicBlock *BB = I->getParent();
3025 if (auto *Phi = dyn_cast<PHINode>(I))
3026 BB = Phi->getIncomingBlock(
3027 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3028 return BB == PredBB;
3029 };
3030
3031 // Iteratively sink the scalarized operands of the predicated instruction
3032 // into the block we created for it. When an instruction is sunk, its
3033 // operands are then added to the worklist. The algorithm terminates when a
3034 // full pass over the worklist fails to sink any instruction.
3035 bool Changed;
3036 do {
3037 // Add the instructions that need to be reanalyzed to the worklist, and
3038 // reset the changed indicator.
3039 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3040 InstsToReanalyze.clear();
3041 Changed = false;
3042
3043 while (!Worklist.empty()) {
3044 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3045
3046 // We can't sink an instruction if it is a phi node, is not in the loop,
3047 // may have side effects or may read from memory.
3048 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3049 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3050 I->mayHaveSideEffects() || I->mayReadFromMemory())
3051 continue;
3052
3053 // If the instruction is already in PredBB, check if we can sink its
3054 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3055 // sinking the scalar instruction I, hence it appears in PredBB; but it
3056 // may have failed to sink I's operands (recursively), which we try
3057 // (again) here.
3058 if (I->getParent() == PredBB) {
3059 Worklist.insert(I->op_begin(), I->op_end());
3060 continue;
3061 }
3062
3063 // It's legal to sink the instruction if all its uses occur in the
3064 // predicated block. Otherwise, there's nothing to do yet, and we may
3065 // need to reanalyze the instruction.
3066 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3067 InstsToReanalyze.push_back(I);
3068 continue;
3069 }
3070
3071 // Move the instruction to the beginning of the predicated block, and add
3072 // its operands to the worklist.
3073 I->moveBefore(&*PredBB->getFirstInsertionPt());
3074 Worklist.insert(I->op_begin(), I->op_end());
3075
3076 // The sinking may have enabled other instructions to be sunk, so we will
3077 // need to iterate.
3078 Changed = true;
3079 }
3080 } while (Changed);
3081}
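// Sketch of the sinking performed above (hypothetical IR, one lane shown for
// brevity): an address computation feeding only a predicated store can be
// moved into the predicated block so it executes only when the mask is set:
//   pred.store.if:
//     %gep = getelementptr i32, ptr %base, i64 %iv  ; sunk from the loop body
//     store i32 %val, ptr %gep
//     br label %pred.store.continue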
3082
3084 VPTransformState &State) {
3085 auto Iter = vp_depth_first_deep(Plan.getEntry());
3086 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3087 for (VPRecipeBase &P : VPBB->phis()) {
3088 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3089 if (!VPPhi)
3090 continue;
3091 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3092 // Make sure the builder has a valid insert point.
3093 Builder.SetInsertPoint(NewPhi);
3094 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3095 VPValue *Inc = VPPhi->getIncomingValue(i);
3096 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3097 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3098 }
3099 }
3100 }
3101}
3102
3103void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3104 // We should not collect Scalars more than once per VF. Right now, this
3105 // function is called from collectUniformsAndScalars(), which already does
3106 // this check. Collecting Scalars for VF=1 does not make any sense.
3107 assert(VF.isVector() && !Scalars.contains(VF) &&
3108 "This function should not be visited twice for the same VF");
3109
3110 // This avoids any chances of creating a REPLICATE recipe during planning
3111 // since that would result in generation of scalarized code during execution,
3112 // which is not supported for scalable vectors.
3113 if (VF.isScalable()) {
3114 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3115 return;
3116 }
3117
3119
3120 // These sets are used to seed the analysis with pointers used by memory
3121 // accesses that will remain scalar.
3123 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3124 auto *Latch = TheLoop->getLoopLatch();
3125
3126 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3127 // The pointer operands of loads and stores will be scalar as long as the
3128 // memory access is not a gather or scatter operation. The value operand of a
3129 // store will remain scalar if the store is scalarized.
3130 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3131 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3132 assert(WideningDecision != CM_Unknown &&
3133 "Widening decision should be ready at this moment");
3134 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3135 if (Ptr == Store->getValueOperand())
3136 return WideningDecision == CM_Scalarize;
3137 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3138 "Ptr is neither a value nor a pointer operand");
3139 return WideningDecision != CM_GatherScatter;
3140 };
3141
3142 // A helper that returns true if the given value is a getelementptr
3143 // instruction that is not invariant in the loop.
3144 auto isLoopVaryingGEP = [&](Value *V) {
3145 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3146 };
3147
3148 // A helper that evaluates a memory access's use of a pointer. If the use will
3149 // be a scalar use and the pointer is only used by memory accesses, we place
3150 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3151 // PossibleNonScalarPtrs.
3152 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3153 // We only care about getelementptr instructions whose value varies in
3154 // the loop.
3155 if (!isLoopVaryingGEP(Ptr))
3156 return;
3157
3158 // If the pointer has already been identified as scalar (e.g., if it was
3159 // also identified as uniform), there's nothing to do.
3160 auto *I = cast<Instruction>(Ptr);
3161 if (Worklist.count(I))
3162 return;
3163
3164 // If the use of the pointer will be a scalar use, and all users of the
3165 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3166 // place the pointer in PossibleNonScalarPtrs.
3167 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3168 return isa<LoadInst>(U) || isa<StoreInst>(U);
3169 }))
3170 ScalarPtrs.insert(I);
3171 else
3172 PossibleNonScalarPtrs.insert(I);
3173 };
3174
3175 // We seed the scalars analysis with two classes of instructions: (1)
3176 // instructions marked uniform-after-vectorization and (2) bitcast,
3177 // getelementptr and (pointer) phi instructions used by memory accesses
3178 // requiring a scalar use.
3179 //
3180 // (1) Add to the worklist all instructions that have been identified as
3181 // uniform-after-vectorization.
3182 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3183
3184 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3185 // memory accesses requiring a scalar use. The pointer operands of loads and
3186 // stores will be scalar as long as the memory access is not a gather or
3187 // scatter operation. The value operand of a store will remain scalar if the
3188 // store is scalarized.
3189 for (auto *BB : TheLoop->blocks())
3190 for (auto &I : *BB) {
3191 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3192 evaluatePtrUse(Load, Load->getPointerOperand());
3193 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3194 evaluatePtrUse(Store, Store->getPointerOperand());
3195 evaluatePtrUse(Store, Store->getValueOperand());
3196 }
3197 }
3198 for (auto *I : ScalarPtrs)
3199 if (!PossibleNonScalarPtrs.count(I)) {
3200 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3201 Worklist.insert(I);
3202 }
3203
3204 // Insert the forced scalars.
3205 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3206 // induction variable when the PHI user is scalarized.
3207 auto ForcedScalar = ForcedScalars.find(VF);
3208 if (ForcedScalar != ForcedScalars.end())
3209 for (auto *I : ForcedScalar->second) {
3210 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3211 Worklist.insert(I);
3212 }
3213
3214 // Expand the worklist by looking through any bitcasts and getelementptr
3215 // instructions we've already identified as scalar. This is similar to the
3216 // expansion step in collectLoopUniforms(); however, here we're only
3217 // expanding to include additional bitcasts and getelementptr instructions.
3218 unsigned Idx = 0;
3219 while (Idx != Worklist.size()) {
3220 Instruction *Dst = Worklist[Idx++];
3221 if (!isLoopVaryingGEP(Dst->getOperand(0)))
3222 continue;
3223 auto *Src = cast<Instruction>(Dst->getOperand(0));
3224 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3225 auto *J = cast<Instruction>(U);
3226 return !TheLoop->contains(J) || Worklist.count(J) ||
3227 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3228 isScalarUse(J, Src));
3229 })) {
3230 Worklist.insert(Src);
3231 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3232 }
3233 }
3234
3235 // An induction variable will remain scalar if all users of the induction
3236 // variable and induction variable update remain scalar.
3237 for (const auto &Induction : Legal->getInductionVars()) {
3238 auto *Ind = Induction.first;
3239 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3240
3241 // If tail-folding is applied, the primary induction variable will be used
3242 // to feed a vector compare.
3243 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3244 continue;
3245
3246 // Returns true if \p Indvar is a pointer induction that is used directly by
3247 // load/store instruction \p I.
3248 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3249 Instruction *I) {
3250 return Induction.second.getKind() ==
3252 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3253 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3254 };
3255
3256 // Determine if all users of the induction variable are scalar after
3257 // vectorization.
3258 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3259 auto *I = cast<Instruction>(U);
3260 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3261 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3262 });
3263 if (!ScalarInd)
3264 continue;
3265
3266 // If the induction variable update is a fixed-order recurrence, neither the
3267 // induction variable nor its update should be marked scalar after
3268 // vectorization.
3269 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3270 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3271 continue;
3272
3273 // Determine if all users of the induction variable update instruction are
3274 // scalar after vectorization.
3275 auto ScalarIndUpdate =
3276 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3277 auto *I = cast<Instruction>(U);
3278 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3279 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3280 });
3281 if (!ScalarIndUpdate)
3282 continue;
3283
3284 // The induction variable and its update instruction will remain scalar.
3285 Worklist.insert(Ind);
3286 Worklist.insert(IndUpdate);
3287 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3288 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3289 << "\n");
3290 }
3291
3292 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3293}
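
// A minimal standalone sketch (not part of this file; names are illustrative
// only) of the fixed-point reasoning used above for induction variables: the
// induction phi and its update are marked scalar only if every in-loop user
// of either one is already known to be scalar, or is the other member of the
// pair.
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  std::set<std::string> KnownScalar = {"gep", "store.addr"};
  // Hypothetical in-loop users of the induction phi and of its update.
  std::vector<std::string> PhiUsers = {"iv.next", "gep"};
  std::vector<std::string> UpdateUsers = {"iv", "vec.add"};

  auto AllScalar = [&](const std::vector<std::string> &Users,
                       const std::string &Other) {
    return std::all_of(Users.begin(), Users.end(), [&](const std::string &U) {
      return U == Other || KnownScalar.count(U) != 0;
    });
  };

  // "vec.add" is not known scalar, so the phi/update pair is not marked.
  bool IVRemainsScalar =
      AllScalar(PhiUsers, "iv.next") && AllScalar(UpdateUsers, "iv");
  std::cout << "induction remains scalar: " << IVRemainsScalar << "\n"; // 0
}
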
3294
3296 Instruction *I, ElementCount VF) const {
3297 if (!isPredicatedInst(I))
3298 return false;
3299
3300 // Do we have a non-scalar lowering for this predicated
3301 // instruction? No - it is scalar with predication.
3302 switch(I->getOpcode()) {
3303 default:
3304 return true;
3305 case Instruction::Call:
3306 if (VF.isScalar())
3307 return true;
3308 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3309 .Kind == CM_Scalarize;
3310 case Instruction::Load:
3311 case Instruction::Store: {
3313 auto *Ty = getLoadStoreType(I);
3314 Type *VTy = Ty;
3315 if (VF.isVector())
3316 VTy = VectorType::get(Ty, VF);
3317 const Align Alignment = getLoadStoreAlignment(I);
3318 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3319 TTI.isLegalMaskedGather(VTy, Alignment))
3320 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3321 TTI.isLegalMaskedScatter(VTy, Alignment));
3322 }
3323 case Instruction::UDiv:
3324 case Instruction::SDiv:
3325 case Instruction::SRem:
3326 case Instruction::URem: {
3327 // We have the option to use the safe-divisor idiom to avoid predication.
3328 // The cost based decision here will always select safe-divisor for
3329 // scalable vectors as scalarization isn't legal.
3330 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3331 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3332 }
3333 }
3334}
3335
3336// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3338 // If predication is not needed, avoid it.
3339 // TODO: We can use the loop-preheader as context point here and get
3340 // context sensitive reasoning for isSafeToSpeculativelyExecute.
3341 if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3343 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3344 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3345 return false;
3346
3347 // If the instruction was executed conditionally in the original scalar loop,
3348 // predication is needed with a mask whose lanes are all possibly inactive.
3349 if (Legal->blockNeedsPredication(I->getParent()))
3350 return true;
3351
3352 // All that remain are instructions with side-effects originally executed in
3353 // the loop unconditionally, but now execute under a tail-fold mask (only)
3354 // having at least one active lane (the first). If the side-effects of the
3355 // instruction are invariant, executing it without the (tail-folding) mask
3356 // is safe - it will cause the same side-effects as when masked.
3357 switch(I->getOpcode()) {
3358 default:
3360 "instruction should have been considered by earlier checks");
3361 case Instruction::Call:
3362 // Side-effects of a Call are assumed to be non-invariant, needing a
3363 // (fold-tail) mask.
3365 "should have returned earlier for calls not needing a mask");
3366 return true;
3367 case Instruction::Load:
3368 // If the address is loop invariant no predication is needed.
3370 case Instruction::Store: {
3371 // For stores, we need to prove both speculation safety (which follows from
3372 // the same argument as loads), but also must prove the value being stored
3373 // is correct. The easiest form of the latter is to require that all values
3374 // stored are the same.
3376 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3377 }
3378 case Instruction::UDiv:
3379 case Instruction::SDiv:
3380 case Instruction::SRem:
3381 case Instruction::URem:
3382 // If the divisor is loop-invariant no predication is needed.
3383 return !TheLoop->isLoopInvariant(I->getOperand(1));
3384 }
3385}
3386
3387std::pair<InstructionCost, InstructionCost>
3389 ElementCount VF) const {
3390 assert(I->getOpcode() == Instruction::UDiv ||
3391 I->getOpcode() == Instruction::SDiv ||
3392 I->getOpcode() == Instruction::SRem ||
3393 I->getOpcode() == Instruction::URem);
3395
3397
3398 // Scalarization isn't legal for scalable vector types
3399 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3400 if (!VF.isScalable()) {
3401 // Get the scalarization cost and scale this amount by the probability of
3402 // executing the predicated block. If the instruction is not predicated,
3403 // we fall through to the next case.
3404 ScalarizationCost = 0;
3405
3406 // These instructions have a non-void type, so account for the phi nodes
3407 // that we will create. This cost is likely to be zero. The phi node
3408 // cost, if any, should be scaled by the block probability because it
3409 // models a copy at the end of each predicated block.
3410 ScalarizationCost += VF.getKnownMinValue() *
3411 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3412
3413 // The cost of the non-predicated instruction.
3414 ScalarizationCost += VF.getKnownMinValue() *
3415 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3416
3417 // The cost of insertelement and extractelement instructions needed for
3418 // scalarization.
3419 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3420
3421 // Scale the cost by the probability of executing the predicated blocks.
3422 // This assumes the predicated block for each vector lane is equally
3423 // likely.
3424 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3425 }
3426 InstructionCost SafeDivisorCost = 0;
3427
3428 auto *VecTy = ToVectorTy(I->getType(), VF);
3429
3430 // The cost of the select guard to ensure all lanes are well defined
3431 // after we speculate above any internal control flow.
3432 SafeDivisorCost += TTI.getCmpSelInstrCost(
3433 Instruction::Select, VecTy,
3434 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3436
3437 // Certain instructions can be cheaper to vectorize if they have a constant
3438 // second vector operand. One example of this are shifts on x86.
3439 Value *Op2 = I->getOperand(1);
3440 auto Op2Info = TTI.getOperandInfo(Op2);
3441 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3442 Legal->isInvariant(Op2))
3444
3445 SmallVector<const Value *, 4> Operands(I->operand_values());
3446 SafeDivisorCost += TTI.getArithmeticInstrCost(
3447 I->getOpcode(), VecTy, CostKind,
3448 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3449 Op2Info, Operands, I);
3450 return {ScalarizationCost, SafeDivisorCost};
3451}
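
// A standalone numeric sketch of the trade-off computed above, using made-up
// unit costs (none of these numbers come from TTI): scalarization pays for VF
// scalar divides, the per-lane insert/extract overhead and the phis, all
// scaled down by the probability of the predicated block executing, while the
// safe-divisor form pays for one vector select plus one vector divide.
#include <iostream>

int main() {
  const unsigned VF = 4;
  const unsigned PhiCost = 0, ScalarDivCost = 20, InsExtOverhead = 8;
  const unsigned ReciprocalPredBlockProb = 2; // assume a 50% block probability
  unsigned ScalarizationCost =
      (VF * (PhiCost + ScalarDivCost) + InsExtOverhead) /
      ReciprocalPredBlockProb; // (4*20 + 8) / 2 = 44

  const unsigned VecSelectCost = 2, VecDivCost = 28;
  unsigned SafeDivisorCost = VecSelectCost + VecDivCost; // 30

  // With these numbers the safe-divisor idiom is the cheaper option.
  std::cout << "scalarization: " << ScalarizationCost
            << ", safe-divisor: " << SafeDivisorCost << "\n";
}
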
3452
3454 Instruction *I, ElementCount VF) const {
3455 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3457 "Decision should not be set yet.");
3458 auto *Group = getInterleavedAccessGroup(I);
3459 assert(Group && "Must have a group.");
3460
3461 // If the instruction's allocated size doesn't equal its type size, it
3462 // requires padding and will be scalarized.
3463 auto &DL = I->getDataLayout();
3464 auto *ScalarTy = getLoadStoreType(I);
3465 if (hasIrregularType(ScalarTy, DL))
3466 return false;
3467
3468 // If the group involves a non-integral pointer, we may not be able to
3469 // losslessly cast all values to a common type.
3470 unsigned InterleaveFactor = Group->getFactor();
3471 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3472 for (unsigned i = 0; i < InterleaveFactor; i++) {
3473 Instruction *Member = Group->getMember(i);
3474 if (!Member)
3475 continue;
3476 auto *MemberTy = getLoadStoreType(Member);
3477 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3478 // Don't coerce non-integral pointers to integers or vice versa.
3479 if (MemberNI != ScalarNI) {
3480 // TODO: Consider adding special nullptr value case here
3481 return false;
3482 } else if (MemberNI && ScalarNI &&
3483 ScalarTy->getPointerAddressSpace() !=
3484 MemberTy->getPointerAddressSpace()) {
3485 return false;
3486 }
3487 }
3488
3489 // Check if masking is required.
3490 // A Group may need masking for one of two reasons: it resides in a block that
3491 // needs predication, or it was decided to use masking to deal with gaps
3492 // (either a gap at the end of a load-access that may result in a speculative
3493 // load, or any gaps in a store-access).
3494 bool PredicatedAccessRequiresMasking =
3495 blockNeedsPredicationForAnyReason(I->getParent()) &&
3497 bool LoadAccessWithGapsRequiresEpilogMasking =
3498 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3500 bool StoreAccessWithGapsRequiresMasking =
3501 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3502 if (!PredicatedAccessRequiresMasking &&
3503 !LoadAccessWithGapsRequiresEpilogMasking &&
3504 !StoreAccessWithGapsRequiresMasking)
3505 return true;
3506
3507 // If masked interleaving is required, we expect that the user/target had
3508 // enabled it, because otherwise it either wouldn't have been created or
3509 // it should have been invalidated by the CostModel.
3511 "Masked interleave-groups for predicated accesses are not enabled.");
3512
3513 if (Group->isReverse())
3514 return false;
3515
3516 auto *Ty = getLoadStoreType(I);
3517 const Align Alignment = getLoadStoreAlignment(I);
3518 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3519 : TTI.isLegalMaskedStore(Ty, Alignment);
3520}
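
// A simplified standalone sketch (assumptions, not this file's API) of the
// three conditions checked above that force masking of an interleave group;
// it omits the legality checks on masked loads/stores and on reversed groups.
#include <iostream>

struct GroupInfo {
  bool InPredicatedBlock;      // resides in a block needing predication
  bool IsLoad;                 // load group (otherwise a store group)
  unsigned Factor;             // interleave factor
  unsigned NumMembers;         // members present; gaps exist if < Factor
  bool RequiresScalarEpilogue; // trailing gap in a load group
  bool ScalarEpilogueAllowed;
};

static bool needsMask(const GroupInfo &G) {
  bool PredicatedAccess = G.InPredicatedBlock;
  bool LoadWithGapsNeedsEpilogMask =
      G.IsLoad && G.RequiresScalarEpilogue && !G.ScalarEpilogueAllowed;
  bool StoreWithGapsNeedsMask = !G.IsLoad && G.NumMembers < G.Factor;
  return PredicatedAccess || LoadWithGapsNeedsEpilogMask ||
         StoreWithGapsNeedsMask;
}

int main() {
  // A store group with factor 4 but only 3 members has a gap, so it can only
  // be widened if masked stores are legal for the target.
  GroupInfo StoreWithGap{false, false, 4, 3, false, true};
  std::cout << "needs mask: " << needsMask(StoreWithGap) << "\n"; // 1
}
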
3521
3523 Instruction *I, ElementCount VF) {
3524 // Get and ensure we have a valid memory instruction.
3525 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3526
3528 auto *ScalarTy = getLoadStoreType(I);
3529
3530 // In order to be widened, the pointer should be consecutive, first of all.
3531 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3532 return false;
3533
3534 // If the instruction is a store located in a predicated block, it will be
3535 // scalarized.
3536 if (isScalarWithPredication(I, VF))
3537 return false;
3538
3539 // If the instruction's allocated size doesn't equal its type size, it
3540 // requires padding and will be scalarized.
3541 auto &DL = I->getDataLayout();
3542 if (hasIrregularType(ScalarTy, DL))
3543 return false;
3544
3545 return true;
3546}
3547
3548void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3549 // We should not collect Uniforms more than once per VF. Right now,
3550 // this function is called from collectUniformsAndScalars(), which
3551 // already does this check. Collecting Uniforms for VF=1 does not make any
3552 // sense.
3553
3554 assert(VF.isVector() && !Uniforms.contains(VF) &&
3555 "This function should not be visited twice for the same VF");
3556
3557 // Initialize an entry for this VF so that, even if no uniform values are
3558 // found, we do not analyze it again; Uniforms.count(VF) will return 1.
3559 Uniforms[VF].clear();
3560
3561 // We now know that the loop is vectorizable!
3562 // Collect instructions inside the loop that will remain uniform after
3563 // vectorization.
3564
3565 // Global values, params and instructions outside of the current loop are out of
3566 // scope.
3567 auto isOutOfScope = [&](Value *V) -> bool {
3568 Instruction *I = dyn_cast<Instruction>(V);
3569 return (!I || !TheLoop->contains(I));
3570 };
3571
3572 // Worklist containing uniform instructions demanding lane 0.
3573 SetVector<Instruction *> Worklist;
3574
3575 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3576 // that require predication must not be considered uniform after
3577 // vectorization, because that would create an erroneous replicating region
3578 // where only a single instance out of VF should be formed.
3579 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
3580 if (isOutOfScope(I)) {
3581 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3582 << *I << "\n");
3583 return;
3584 }
3585 if (isPredicatedInst(I)) {
3586 LLVM_DEBUG(
3587 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3588 << "\n");
3589 return;
3590 }
3591 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3592 Worklist.insert(I);
3593 };
3594
3595 // Start with the conditional branches exiting the loop. If the branch
3596 // condition is an instruction contained in the loop that is only used by the
3597 // branch, it is uniform.
3599 TheLoop->getExitingBlocks(Exiting);
3600 for (BasicBlock *E : Exiting) {
3601 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3602 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3603 addToWorklistIfAllowed(Cmp);
3604 }
3605
3606 auto PrevVF = VF.divideCoefficientBy(2);
3607 // Return true if all lanes perform the same memory operation, and we can
3608 // thus choose to execute only one.
3609 auto isUniformMemOpUse = [&](Instruction *I) {
3610 // If the value was already known to not be uniform for the previous
3611 // (smaller VF), it cannot be uniform for the larger VF.
3612 if (PrevVF.isVector()) {
3613 auto Iter = Uniforms.find(PrevVF);
3614 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3615 return false;
3616 }
3617 if (!Legal->isUniformMemOp(*I, VF))
3618 return false;
3619 if (isa<LoadInst>(I))
3620 // Loading the same address always produces the same result - at least
3621 // assuming aliasing and ordering which have already been checked.
3622 return true;
3623 // Storing the same value on every iteration.
3624 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3625 };
3626
3627 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
3628 InstWidening WideningDecision = getWideningDecision(I, VF);
3629 assert(WideningDecision != CM_Unknown &&
3630 "Widening decision should be ready at this moment");
3631
3632 if (isUniformMemOpUse(I))
3633 return true;
3634
3635 return (WideningDecision == CM_Widen ||
3636 WideningDecision == CM_Widen_Reverse ||
3637 WideningDecision == CM_Interleave);
3638 };
3639
3640 // Returns true if Ptr is the pointer operand of a memory access instruction
3641 // I, where I is known not to require scalarization, and the pointer is
3642 // not also stored.
3643 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3644 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3645 return false;
3646 return getLoadStorePointerOperand(I) == Ptr &&
3647 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3648 };
3649
3650 // Holds a list of values which are known to have at least one uniform use.
3651 // Note that there may be other uses which aren't uniform. A "uniform use"
3652 // here is something which only demands lane 0 of the unrolled iterations;
3653 // it does not imply that all lanes produce the same value (e.g. this is not
3654 // the usual meaning of uniform)
3655 SetVector<Value *> HasUniformUse;
3656
3657 // Scan the loop for instructions which are either a) known to have only
3658 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3659 for (auto *BB : TheLoop->blocks())
3660 for (auto &I : *BB) {
3661 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3662 switch (II->getIntrinsicID()) {
3663 case Intrinsic::sideeffect:
3664 case Intrinsic::experimental_noalias_scope_decl:
3665 case Intrinsic::assume:
3666 case Intrinsic::lifetime_start:
3667 case Intrinsic::lifetime_end:
3669 addToWorklistIfAllowed(&I);
3670 break;
3671 default:
3672 break;
3673 }
3674 }
3675
3676 // ExtractValue instructions must be uniform, because the operands are
3677 // known to be loop-invariant.
3678 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3679 assert(isOutOfScope(EVI->getAggregateOperand()) &&
3680 "Expected aggregate value to be loop invariant");
3681 addToWorklistIfAllowed(EVI);
3682 continue;
3683 }
3684
3685 // If there's no pointer operand, there's nothing to do.
3687 if (!Ptr)
3688 continue;
3689
3690 if (isUniformMemOpUse(&I))
3691 addToWorklistIfAllowed(&I);
3692
3693 if (isVectorizedMemAccessUse(&I, Ptr))
3694 HasUniformUse.insert(Ptr);
3695 }
3696
3697 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3698 // demanding) users. Since loops are assumed to be in LCSSA form, this
3699 // disallows uses outside the loop as well.
3700 for (auto *V : HasUniformUse) {
3701 if (isOutOfScope(V))
3702 continue;
3703 auto *I = cast<Instruction>(V);
3704 auto UsersAreMemAccesses =
3705 llvm::all_of(I->users(), [&](User *U) -> bool {
3706 auto *UI = cast<Instruction>(U);
3707 return TheLoop->contains(UI) && isVectorizedMemAccessUse(UI, V);
3708 });
3709 if (UsersAreMemAccesses)
3710 addToWorklistIfAllowed(I);
3711 }
3712
3713 // Expand Worklist in topological order: whenever a new instruction
3714 // is added, its users should already be inside the Worklist. This ensures
3715 // a uniform instruction will only be used by uniform instructions.
3716 unsigned idx = 0;
3717 while (idx != Worklist.size()) {
3718 Instruction *I = Worklist[idx++];
3719
3720 for (auto *OV : I->operand_values()) {
3721 // isOutOfScope operands cannot be uniform instructions.
3722 if (isOutOfScope(OV))
3723 continue;
3724 // First-order recurrence phis should typically be considered
3725 // non-uniform.
3726 auto *OP = dyn_cast<PHINode>(OV);
3728 continue;
3729 // If all the users of the operand are uniform, then add the
3730 // operand into the uniform worklist.
3731 auto *OI = cast<Instruction>(OV);
3732 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3733 auto *J = cast<Instruction>(U);
3734 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
3735 }))
3736 addToWorklistIfAllowed(OI);
3737 }
3738 }
3739
3740 // For an instruction to be added into Worklist above, all its users inside
3741 // the loop should also be in Worklist. However, this condition cannot be
3742 // true for phi nodes that form a cyclic dependence. We must process phi
3743 // nodes separately. An induction variable will remain uniform if all users
3744 // of the induction variable and induction variable update remain uniform.
3745 // The code below handles both pointer and non-pointer induction variables.
3746 BasicBlock *Latch = TheLoop->getLoopLatch();
3747 for (const auto &Induction : Legal->getInductionVars()) {
3748 auto *Ind = Induction.first;
3749 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3750
3751 // Determine if all users of the induction variable are uniform after
3752 // vectorization.
3753 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3754 auto *I = cast<Instruction>(U);
3755 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3756 isVectorizedMemAccessUse(I, Ind);
3757 });
3758 if (!UniformInd)
3759 continue;
3760
3761 // Determine if all users of the induction variable update instruction are
3762 // uniform after vectorization.
3763 auto UniformIndUpdate =
3764 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3765 auto *I = cast<Instruction>(U);
3766 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3767 isVectorizedMemAccessUse(I, IndUpdate);
3768 });
3769 if (!UniformIndUpdate)
3770 continue;
3771
3772 // The induction variable and its update instruction will remain uniform.
3773 addToWorklistIfAllowed(Ind);
3774 addToWorklistIfAllowed(IndUpdate);
3775 }
3776
3777 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3778}
3779
3781 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3782
3784 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3785 "runtime pointer checks needed. Enable vectorization of this "
3786 "loop with '#pragma clang loop vectorize(enable)' when "
3787 "compiling with -Os/-Oz",
3788 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3789 return true;
3790 }
3791
3792 if (!PSE.getPredicate().isAlwaysTrue()) {
3793 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3794 "runtime SCEV checks needed. Enable vectorization of this "
3795 "loop with '#pragma clang loop vectorize(enable)' when "
3796 "compiling with -Os/-Oz",
3797 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3798 return true;
3799 }
3800
3801 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3802 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3803 reportVectorizationFailure("Runtime stride check for small trip count",
3804 "runtime stride == 1 checks needed. Enable vectorization of "
3805 "this loop without such check by compiling with -Os/-Oz",
3806 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3807 return true;
3808 }
3809
3810 return false;
3811}
3812
3813bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3814 if (IsScalableVectorizationAllowed)
3815 return *IsScalableVectorizationAllowed;
3816
3817 IsScalableVectorizationAllowed = false;
3819 return false;
3820
3822 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3823 "ScalableVectorizationDisabled", ORE, TheLoop);
3824 return false;
3825 }
3826
3827 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3828
3829 auto MaxScalableVF = ElementCount::getScalable(
3830 std::numeric_limits<ElementCount::ScalarTy>::max());
3831
3832 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3833 // FIXME: While for scalable vectors this is currently sufficient, this should
3834 // be replaced by a more detailed mechanism that filters out specific VFs,
3835 // instead of invalidating vectorization for a whole set of VFs based on the
3836 // MaxVF.
3837
3838 // Disable scalable vectorization if the loop contains unsupported reductions.
3839 if (!canVectorizeReductions(MaxScalableVF)) {
3841 "Scalable vectorization not supported for the reduction "
3842 "operations found in this loop.",
3843 "ScalableVFUnfeasible", ORE, TheLoop);
3844 return false;
3845 }
3846
3847 // Disable scalable vectorization if the loop contains any instructions
3848 // with element types not supported for scalable vectors.
3849 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3850 return !Ty->isVoidTy() &&
3852 })) {
3853 reportVectorizationInfo("Scalable vectorization is not supported "
3854 "for all element types found in this loop.",
3855 "ScalableVFUnfeasible", ORE, TheLoop);
3856 return false;
3857 }
3858
3860 reportVectorizationInfo("The target does not provide maximum vscale value "
3861 "for safe distance analysis.",
3862 "ScalableVFUnfeasible", ORE, TheLoop);
3863 return false;
3864 }
3865
3866 IsScalableVectorizationAllowed = true;
3867 return true;
3868}
3869
3871LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3872 if (!isScalableVectorizationAllowed())
3873 return ElementCount::getScalable(0);
3874
3875 auto MaxScalableVF = ElementCount::getScalable(
3876 std::numeric_limits<ElementCount::ScalarTy>::max());
3878 return MaxScalableVF;
3879
3880 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3881 // Limit MaxScalableVF by the maximum safe dependence distance.
3882 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3883
3884 if (!MaxScalableVF)
3886 "Max legal vector width too small, scalable vectorization "
3887 "unfeasible.",
3888 "ScalableVFUnfeasible", ORE, TheLoop);
3889
3890 return MaxScalableVF;
3891}
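
// A small standalone illustration, with made-up numbers, of the clamping
// above: given a maximum safe dependence distance of MaxSafeElements lanes
// and a target whose vscale never exceeds MaxVScale, the scalable VF is
// limited to vscale x (MaxSafeElements / MaxVScale).
#include <iostream>

int main() {
  unsigned MaxSafeElements = 32; // from dependence analysis
  unsigned MaxVScale = 16;       // e.g. a target with up to 2048-bit vectors
  unsigned MaxScalableMinLanes = MaxSafeElements / MaxVScale;
  // Even at the largest possible vscale, vscale x 2 covers at most 32
  // elements, so the safe dependence distance is respected.
  std::cout << "max legal scalable VF: vscale x " << MaxScalableMinLanes
            << "\n"; // vscale x 2
}
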
3892
3893FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3894 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3896 unsigned SmallestType, WidestType;
3897 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3898
3899 // Get the maximum safe dependence distance in bits computed by LAA.
3900 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3901 // the memory access that is most restrictive (involved in the smallest
3902 // dependence distance).
3903 unsigned MaxSafeElements =
3905
3906 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3907 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3908
3909 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3910 << ".\n");
3911 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3912 << ".\n");
3913
3914 // First analyze the UserVF, fall back if the UserVF should be ignored.
3915 if (UserVF) {
3916 auto MaxSafeUserVF =
3917 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3918
3919 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3920 // If `VF=vscale x N` is safe, then so is `VF=N`
3921 if (UserVF.isScalable())
3922 return FixedScalableVFPair(
3923 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3924 else
3925 return UserVF;
3926 }
3927
3928 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3929
3930 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3931 // is better to ignore the hint and let the compiler choose a suitable VF.
3932 if (!UserVF.isScalable()) {
3933 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3934 << " is unsafe, clamping to max safe VF="
3935 << MaxSafeFixedVF << ".\n");
3936 ORE->emit([&]() {
3937 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3939 TheLoop->getHeader())
3940 << "User-specified vectorization factor "
3941 << ore::NV("UserVectorizationFactor", UserVF)
3942 << " is unsafe, clamping to maximum safe vectorization factor "
3943 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3944 });
3945 return MaxSafeFixedVF;
3946 }
3947
3949 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3950 << " is ignored because scalable vectors are not "
3951 "available.\n");
3952 ORE->emit([&]() {
3953 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3955 TheLoop->getHeader())
3956 << "User-specified vectorization factor "
3957 << ore::NV("UserVectorizationFactor", UserVF)
3958 << " is ignored because the target does not support scalable "
3959 "vectors. The compiler will pick a more suitable value.";
3960 });
3961 } else {
3962 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3963 << " is unsafe. Ignoring scalable UserVF.\n");
3964 ORE->emit([&]() {
3965 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3967 TheLoop->getHeader())
3968 << "User-specified vectorization factor "
3969 << ore::NV("UserVectorizationFactor", UserVF)
3970 << " is unsafe. Ignoring the hint to let the compiler pick a "
3971 "more suitable value.";
3972 });
3973 }
3974 }
3975
3976 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3977 << " / " << WidestType << " bits.\n");
3978
3981 if (auto MaxVF =
3982 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3983 MaxSafeFixedVF, FoldTailByMasking))
3984 Result.FixedVF = MaxVF;
3985
3986 if (auto MaxVF =
3987 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3988 MaxSafeScalableVF, FoldTailByMasking))
3989 if (MaxVF.isScalable()) {
3990 Result.ScalableVF = MaxVF;
3991 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3992 << "\n");
3993 }
3994
3995 return Result;
3996}
3997
4001 // TODO: It may be useful to do so, since it's still likely to be dynamically
4002 // uniform if the target can skip.
4004 "Not inserting runtime ptr check for divergent target",
4005 "runtime pointer checks needed. Not enabled for divergent target",
4006 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4008 }
4009
4010 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4011 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4012 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4013 if (TC == 1) {
4014 reportVectorizationFailure("Single iteration (non) loop",
4015 "loop trip count is one, irrelevant for vectorization",
4016 "SingleIterationLoop", ORE, TheLoop);
4018 }
4019
4020 switch (ScalarEpilogueStatus) {
4022 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4024 [[fallthrough]];
4026 LLVM_DEBUG(
4027 dbgs() << "LV: vector predicate hint/switch found.\n"
4028 << "LV: Not allowing scalar epilogue, creating predicated "
4029 << "vector loop.\n");
4030 break;
4032 // fallthrough as a special case of OptForSize
4034 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4035 LLVM_DEBUG(
4036 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4037 else
4038 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4039 << "count.\n");
4040
4041 // Bail if runtime checks are required, which are not good when optimising
4042 // for size.
4045
4046 break;
4047 }
4048
4049 // The only loops we can vectorize without a scalar epilogue, are loops with
4050 // a bottom-test and a single exiting block. We'd have to handle the fact
4051 // that not every instruction executes on the last iteration. This will
4052 // require a lane mask which varies through the vector loop body. (TODO)
4054 // If there was a tail-folding hint/switch, but we can't fold the tail by
4055 // masking, fall back to vectorization with a scalar epilogue.
4056 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4057 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4058 "scalar epilogue instead.\n");
4059 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4060 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4061 }
4063 }
4064
4065 // Now try the tail folding
4066
4067 // Invalidate interleave groups that require an epilogue if we can't mask
4068 // the interleave-group.
4070 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4071 "No decisions should have been taken at this point");
4072 // Note: There is no need to invalidate any cost modeling decisions here, as
4073 // none were taken so far.
4075 }
4076
4077 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4078
4079 // Avoid tail folding if the trip count is known to be a multiple of any VF
4080 // we choose.
4081 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4082 MaxFactors.FixedVF.getFixedValue();
4083 if (MaxFactors.ScalableVF) {
4084 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4085 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4086 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4087 *MaxPowerOf2RuntimeVF,
4088 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4089 } else
4090 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4091 }
4092
4093 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4094 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4095 "MaxFixedVF must be a power of 2");
4096 unsigned MaxVFtimesIC =
4097 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4098 ScalarEvolution *SE = PSE.getSE();
4099 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4100 const SCEV *ExitCount = SE->getAddExpr(
4101 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4102 const SCEV *Rem = SE->getURemExpr(
4103 SE->applyLoopGuards(ExitCount, TheLoop),
4104 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4105 if (Rem->isZero()) {
4106 // Accept MaxFixedVF if we do not have a tail.
4107 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4108 return MaxFactors;
4109 }
4110 }
4111
4112 // If we don't know the precise trip count, or if the trip count that we
4113 // found modulo the vectorization factor is not zero, try to fold the tail
4114 // by masking.
4115 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4116 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4117 if (foldTailByMasking()) {
4119 LLVM_DEBUG(
4120 dbgs()
4121 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4122 "try to generate VP Intrinsics with scalable vector "
4123 "factors only.\n");
4124 // A tail-folded loop using VP intrinsics restricts the VF to be scalable
4125 // for now.
4126 // TODO: extend it for fixed vectors, if required.
4127 assert(MaxFactors.ScalableVF.isScalable() &&
4128 "Expected scalable vector factor.");
4129
4130 MaxFactors.FixedVF = ElementCount::getFixed(1);
4131 }
4132 return MaxFactors;
4133 }
4134
4135 // If there was a tail-folding hint/switch, but we can't fold the tail by
4136 // masking, fall back to vectorization with a scalar epilogue.
4137 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4138 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4139 "scalar epilogue instead.\n");
4140 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4141 return MaxFactors;
4142 }
4143
4144 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4145 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4147 }
4148
4149 if (TC == 0) {
4151 "Unable to calculate the loop count due to complex control flow",
4152 "unable to calculate the loop count due to complex control flow",
4153 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4155 }
4156
4158 "Cannot optimize for size and vectorize at the same time.",
4159 "cannot optimize for size and vectorize at the same time. "
4160 "Enable vectorization of this loop with '#pragma clang loop "
4161 "vectorize(enable)' when compiling with -Os/-Oz",
4162 "NoTailLoopWithOptForSize", ORE, TheLoop);
4164}
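
// A standalone arithmetic sketch (plain integers instead of SCEV) of the
// "no tail will remain" test above: the exit count is the backedge-taken
// count plus one, and if it is an exact multiple of MaxVF * UserIC then no
// scalar tail is left for any chosen VF, so neither tail folding nor a
// scalar epilogue is required.
#include <iostream>

int main() {
  unsigned BackedgeTakenCount = 63; // the loop runs 64 iterations
  unsigned MaxPowerOf2RuntimeVF = 8;
  unsigned UserIC = 2;
  unsigned ExitCount = BackedgeTakenCount + 1;
  unsigned MaxVFtimesIC = MaxPowerOf2RuntimeVF * UserIC;
  bool NoTail = (ExitCount % MaxVFtimesIC) == 0; // 64 % 16 == 0
  std::cout << "tail iterations remain: " << !NoTail << "\n"; // 0
}
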
4165
4166ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4167 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4168 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4169 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4170 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4171 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4173
4174 // Convenience function to return the minimum of two ElementCounts.
4175 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4176 assert((LHS.isScalable() == RHS.isScalable()) &&
4177 "Scalable flags must match");
4178 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4179 };
4180
4181 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4182 // Note that both WidestRegister and WidestType may not be powers of 2.
4183 auto MaxVectorElementCount = ElementCount::get(
4184 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4185 ComputeScalableMaxVF);
4186 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4187 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4188 << (MaxVectorElementCount * WidestType) << " bits.\n");
4189
4190 if (!MaxVectorElementCount) {
4191 LLVM_DEBUG(dbgs() << "LV: The target has no "
4192 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4193 << " vector registers.\n");
4194 return ElementCount::getFixed(1);
4195 }
4196
4197 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4198 if (MaxVectorElementCount.isScalable() &&
4199 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4200 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4201 auto Min = Attr.getVScaleRangeMin();
4202 WidestRegisterMinEC *= Min;
4203 }
4204
4205 // When a scalar epilogue is required, at least one iteration of the scalar
4206 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4207 // max VF that results in a dead vector loop.
4208 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4209 MaxTripCount -= 1;
4210
4211 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4212 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4213 // If the upper bound on the loop trip count (TC) is known at compile time,
4214 // there is no point in choosing a VF greater than TC (as done in the loop
4215 // below). Select the maximum power of two which doesn't exceed TC. If
4216 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
4217 // the TC is less than or equal to the known number of lanes.
4218 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4219 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4220 "exceeding the constant trip count: "
4221 << ClampedUpperTripCount << "\n");
4222 return ElementCount::get(
4223 ClampedUpperTripCount,
4224 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4225 }
4226
4228 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4230 ElementCount MaxVF = MaxVectorElementCount;
4231 if (MaximizeBandwidth ||
4232 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4235 auto MaxVectorElementCountMaxBW = ElementCount::get(
4236 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4237 ComputeScalableMaxVF);
4238 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4239
4240 // Collect all viable vectorization factors larger than the default MaxVF
4241 // (i.e. MaxVectorElementCount).
4243 for (ElementCount VS = MaxVectorElementCount * 2;
4244 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4245 VFs.push_back(VS);
4246
4247 // For each VF calculate its register usage.
4248 auto RUs = calculateRegisterUsage(VFs);
4249
4250 // Select the largest VF which doesn't require more registers than existing
4251 // ones.
4252 for (int I = RUs.size() - 1; I >= 0; --I) {
4253 const auto &MLU = RUs[I].MaxLocalUsers;
4254 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4255 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4256 })) {
4257 MaxVF = VFs[I];
4258 break;
4259 }
4260 }
4261 if (ElementCount MinVF =
4262 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4263 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4264 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4265 << ") with target's minimum: " << MinVF << '\n');
4266 MaxVF = MinVF;
4267 }
4268 }
4269
4270 // Invalidate any widening decisions we might have made, in case the loop
4271 // requires predication (decided later), but we have already made some
4272 // load/store widening decisions.
4274 }
4275 return MaxVF;
4276}
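
// A standalone sketch with made-up widths of how the maximum VF above is
// derived: divide the widest register by the widest element type and round
// down to a power of two; when a small trip count is known (and, in the real
// code, does not exceed the minimum lane count), additionally clamp to the
// largest power of two not exceeding that trip count.
#include <algorithm>
#include <bit>
#include <iostream>

int main() {
  unsigned WidestRegisterBits = 256; // e.g. a 256-bit vector unit
  unsigned WidestTypeBits = 48;      // the widest type need not be a power of 2
  unsigned MaxVF = std::bit_floor(WidestRegisterBits / WidestTypeBits); // 4

  unsigned MaxTripCount = 3; // small and known at compile time
  unsigned Clamped = std::min(MaxVF, std::bit_floor(MaxTripCount)); // 2
  std::cout << "MaxVF = " << MaxVF << ", clamped = " << Clamped << "\n";
}
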
4277
4278/// Convenience function that returns the value of vscale_range iff
4279/// vscale_range.min == vscale_range.max or otherwise returns the value
4280/// returned by the corresponding TTI method.
4281static std::optional<unsigned>
4283 const Function *Fn = L->getHeader()->getParent();
4284 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4285 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4286 auto Min = Attr.getVScaleRangeMin();
4287 auto Max = Attr.getVScaleRangeMax();
4288 if (Max && Min == Max)
4289 return Max;
4290 }
4291
4292 return TTI.getVScaleForTuning();
4293}
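
// A minimal sketch (illustrative only, with plain optionals in place of the
// attribute and TTI queries) of the convenience logic above: when the
// vscale_range attribute pins vscale to a single value (min == max), return
// it; otherwise defer to a target-provided tuning value, which may be absent.
#include <iostream>
#include <optional>

static std::optional<unsigned>
vscaleForTuning(std::optional<unsigned> AttrMin, std::optional<unsigned> AttrMax,
                std::optional<unsigned> TargetTuningVScale) {
  if (AttrMin && AttrMax && *AttrMin == *AttrMax)
    return *AttrMax;
  return TargetTuningVScale;
}

int main() {
  // vscale_range(2,2): the answer is known exactly.
  std::cout << vscaleForTuning(2, 2, 1).value_or(0) << "\n"; // 2
  // vscale_range(1,16): fall back to the target's tuning value.
  std::cout << vscaleForTuning(1, 16, 4).value_or(0) << "\n"; // 4
}
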
4294
4295bool LoopVectorizationPlanner::isMoreProfitable(
4296 const VectorizationFactor &A, const VectorizationFactor &B) const {
4297 InstructionCost CostA = A.Cost;
4298 InstructionCost CostB = B.Cost;
4299
4300 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4301
4302 // Improve estimate for the vector width if it is scalable.
4303 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4304 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4305 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4306 if (A.Width.isScalable())
4307 EstimatedWidthA *= *VScale;
4308 if (B.Width.isScalable())
4309 EstimatedWidthB *= *VScale;
4310 }
4311
4312 // Assume vscale may be larger than 1 (or the value being tuned for),
4313 // so that scalable vectorization is slightly favorable over fixed-width
4314 // vectorization.
4315 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4316 A.Width.isScalable() && !B.Width.isScalable();
4317
4318 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4319 const InstructionCost &RHS) {
4320 return PreferScalable ? LHS <= RHS : LHS < RHS;
4321 };
4322
4323 // To avoid the need for FP division:
4324 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4325 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4326 if (!MaxTripCount)
4327 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4328
4329 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4330 InstructionCost VectorCost,
4331 InstructionCost ScalarCost) {
4332 // If the trip count is a known (possibly small) constant, the trip count
4333 // will be rounded up to an integer number of iterations under
4334 // FoldTailByMasking. The total cost in that case will be
4335 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4336 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4337 // some extra overheads, but for the purpose of comparing the costs of
4338 // different VFs we can use this to compare the total loop-body cost
4339 // expected after vectorization.
4340 if (CM.foldTailByMasking())
4341 return VectorCost * divideCeil(MaxTripCount, VF);
4342 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4343 };
4344
4345 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4346 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4347 return CmpFn(RTCostA, RTCostB);
4348}
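
// A standalone numeric sketch (made-up costs) of the two comparisons used
// above: without a known trip count, per-lane costs are compared by cross
// multiplication to avoid division; with a known maximum trip count, the
// estimated whole-loop cost is compared per VF instead.
#include <iostream>

int main() {
  // CostA/WidthA < CostB/WidthB  <=>  CostA*WidthB < CostB*WidthA.
  unsigned CostA = 20, WidthA = 4, CostB = 36, WidthB = 8;
  bool AWins = CostA * WidthB < CostB * WidthA; // 160 < 144 -> false
  std::cout << "A more profitable: " << AWins << "\n";

  // Known trip count, tail not folded:
  //   total = VecCost * floor(TC/VF) + ScalarCost * (TC % VF).
  unsigned TC = 10, VF = 4, VecCost = 20, ScalarCost = 7;
  unsigned NotFolded = VecCost * (TC / VF) + ScalarCost * (TC % VF); // 54
  // Tail folding rounds the iteration count up: total = VecCost * ceil(TC/VF).
  unsigned Folded = VecCost * ((TC + VF - 1) / VF); // 60
  std::cout << NotFolded << " (scalar tail) vs " << Folded
            << " (folded tail)\n";
}
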
4349
4352 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4353 LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
4354 SmallVector<RecipeVFPair> InvalidCosts;
4355 for (const auto &Plan : VPlans) {
4356 for (ElementCount VF : Plan->vectorFactors()) {
4357 VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx,
4358 CM);
4359 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4360 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4361 for (auto &R : *VPBB) {
4362 if (!R.cost(VF, CostCtx).isValid())
4363 InvalidCosts.emplace_back(&R, VF);
4364 }
4365 }
4366 }
4367 }
4368 if (InvalidCosts.empty())
4369 return;
4370
4371 // Emit a report of VFs with invalid costs in the loop.
4372
4373 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4375 unsigned I = 0;
4376 for (auto &Pair : InvalidCosts)
4377 if (!Numbering.count(Pair.first))
4378 Numbering[Pair.first] = I++;
4379
4380 // Sort the list, first on recipe(number) then on VF.
4381 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4382 if (Numbering[A.first] != Numbering[B.first])
4383 return Numbering[A.first] < Numbering[B.first];
4384 const auto &LHS = A.second;
4385 const auto &RHS = B.second;
4386 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4387 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4388 });
4389
4390 // For a list of ordered recipe-VF pairs:
4391 // [(load, VF1), (load, VF2), (store, VF1)]
4392 // group the recipes together to emit separate remarks for:
4393 // load (VF1, VF2)
4394 // store (VF1)
4395 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4396 auto Subset = ArrayRef<RecipeVFPair>();
4397 do {
4398 if (Subset.empty())
4399 Subset = Tail.take_front(1);
4400
4401 VPRecipeBase *R = Subset.front().first;
4402
4403 unsigned Opcode =
4406 [](const auto *R) { return Instruction::PHI; })
4407 .Case<VPWidenSelectRecipe>(
4408 [](const auto *R) { return Instruction::Select; })
4409 .Case<VPWidenStoreRecipe>(
4410 [](const auto *R) { return Instruction::Store; })
4411 .Case<VPWidenLoadRecipe>(
4412 [](const auto *R) { return Instruction::Load; })
4413 .Case<VPWidenCallRecipe>(
4414 [](const auto *R) { return Instruction::Call; })
4417 [](const auto *R) { return R->getOpcode(); })
4418 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4419 return R->getStoredValues().empty() ? Instruction::Load
4420 : Instruction::Store;
4421 });
4422
4423 // If the next recipe is different, or if there are no other pairs,
4424 // emit a remark for the collated subset. e.g.
4425 // [(load, VF1), (load, VF2))]
4426 // to emit:
4427 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4428 if (Subset == Tail || Tail[Subset.size()].first != R) {
4429 std::string OutString;
4430 raw_string_ostream OS(OutString);
4431 assert(!Subset.empty() && "Unexpected empty range");
4432 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4433 for (const auto &Pair : Subset)
4434 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4435 OS << "):";
4436 if (Opcode == Instruction::Call) {
4437 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4438 Function *CalledFn =
4439 WidenCall ? WidenCall->getCalledScalarFunction()
4440 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4441 ->getLiveInIRValue());
4442 OS << " call to " << CalledFn->getName();
4443 } else
4444 OS << " " << Instruction::getOpcodeName(Opcode);
4445 OS.flush();
4446 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4447 R->getDebugLoc());
4448 Tail = Tail.drop_front(Subset.size());
4449 Subset = {};
4450 } else
4451 // Grow the subset by one element
4452 Subset = Tail.take_front(Subset.size() + 1);
4453 } while (!Tail.empty());
4454}
4455
4456/// Check if any recipe of \p Plan will generate a vector value, which will be
4457/// assigned a vector register.
4459 const TargetTransformInfo &TTI) {
4460 assert(VF.isVector() && "Checking a scalar VF?");
4461 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4463 DenseSet<VPRecipeBase *> EphemeralRecipes;
4464 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4465 // Set of already visited types.
4466 DenseSet<Type *> Visited;
4467 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4469 for (VPRecipeBase &R : *VPBB) {
4470 if (EphemeralRecipes.contains(&R))
4471 continue;
4472 // Continue early if the recipe is considered to not produce a vector
4473 // result. Note that this includes VPInstruction where some opcodes may
4474 // produce a vector, to preserve existing behavior as VPInstructions model
4475 // aspects not directly mapped to existing IR instructions.
4476 switch (R.getVPDefID()) {
4477 case VPDef::VPDerivedIVSC:
4478 case VPDef::VPScalarIVStepsSC:
4479 case VPDef::VPScalarCastSC:
4480 case VPDef::VPReplicateSC:
4481 case VPDef::VPInstructionSC:
4482 case VPDef::VPCanonicalIVPHISC:
4483 case VPDef::VPVectorPointerSC:
4484 case VPDef::VPExpandSCEVSC:
4485 case VPDef::VPEVLBasedIVPHISC:
4486 case VPDef::VPPredInstPHISC:
4487 case VPDef::VPBranchOnMaskSC:
4488 continue;
4489 case VPDef::VPReductionSC:
4490 case VPDef::VPActiveLaneMaskPHISC:
4491 case VPDef::VPWidenCallSC:
4492 case VPDef::VPWidenCanonicalIVSC:
4493 case VPDef::VPWidenCastSC:
4494 case VPDef::VPWidenGEPSC:
4495 case VPDef::VPWidenSC:
4496 case VPDef::VPWidenSelectSC:
4497 case VPDef::VPBlendSC:
4498 case VPDef::VPFirstOrderRecurrencePHISC:
4499 case VPDef::VPWidenPHISC:
4500 case VPDef::VPWidenIntOrFpInductionSC:
4501 case VPDef::VPWidenPointerInductionSC:
4502 case VPDef::VPReductionPHISC:
4503 case VPDef::VPInterleaveSC:
4504 case VPDef::VPWidenLoadEVLSC:
4505 case VPDef::VPWidenLoadSC:
4506 case VPDef::VPWidenStoreEVLSC:
4507 case VPDef::VPWidenStoreSC:
4508 break;
4509 default:
4510 llvm_unreachable("unhandled recipe");
4511 }
4512
4513 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4514 Type *VectorTy = ToVectorTy(ScalarTy, VF);
4515 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4516 if (!NumLegalParts)
4517 return false;
4518 if (VF.isScalable()) {
4519 // <vscale x 1 x iN> is assumed to be profitable over iN because
4520 // scalable registers are a distinct register class from scalar
4521 // ones. If we ever find a target which wants to lower scalable
4522 // vectors back to scalars, we'll need to update this code to
4523 // explicitly ask TTI about the register class uses for each part.
4524 return NumLegalParts <= VF.getKnownMinValue();
4525 }
4526 // Two or more parts that share a register are vectorized.
4527 return NumLegalParts < VF.getKnownMinValue();
4528 };
4529
4530 // If the recipe has no defs and is not a store (e.g., a branch), continue - no value to check.
4531 if (R.getNumDefinedValues() == 0 &&
4532 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4533 &R))
4534 continue;
4535 // For multi-def recipes (currently only interleaved loads), it suffices to
4536 // check the first def only.
4537 // For stores, check their stored value; for interleaved stores, it
4538 // suffices to check the first stored value only. In all cases this is the
4539 // second operand.
4540 VPValue *ToCheck =
4541 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4542 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4543 if (!Visited.insert({ScalarTy}).second)
4544 continue;
4545 if (WillWiden(ScalarTy))
4546 return true;
4547 }
4548 }
4549
4550 return false;
4551}
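
// A small sketch, with illustrative numbers, of the WillWiden check above: a
// type is considered to produce real vector code when its legalized form
// needs fewer parts than there are lanes, i.e. at least one part still holds
// several elements; for scalable VFs one part per known-min lane suffices.
#include <iostream>

static bool willWiden(unsigned NumLegalParts, unsigned KnownMinLanes,
                      bool Scalable) {
  if (NumLegalParts == 0) // the type cannot be legalized at all
    return false;
  return Scalable ? NumLegalParts <= KnownMinLanes
                  : NumLegalParts < KnownMinLanes;
}

int main() {
  // A fixed 8-lane type split into two legal registers still widens.
  std::cout << willWiden(2, 8, false) << "\n"; // 1
  // A fixed 4-lane type scalarized into four parts does not.
  std::cout << willWiden(4, 4, false) << "\n"; // 0
}
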
4552
4553VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4555 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4556 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4557 assert(any_of(VPlans,
4558 [](std::unique_ptr<VPlan> &P) {
4559 return P->hasVF(ElementCount::getFixed(1));
4560 }) &&
4561 "Expected Scalar VF to be a candidate");
4562
4563 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4564 ExpectedCost);
4565 VectorizationFactor ChosenFactor = ScalarCost;
4566
4567 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4568 if (ForceVectorization &&
4569 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4570 // Ignore scalar width, because the user explicitly wants vectorization.
4571 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4572 // evaluation.
4573 ChosenFactor.Cost = InstructionCost::getMax();
4574 }
4575
4576 for (auto &P : VPlans) {
4577 for (ElementCount VF : P->vectorFactors()) {
4578 // The cost for scalar VF=1 is already calculated, so ignore it.
4579 if (VF.isScalar())
4580 continue;
4581
4583 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4584
4585#ifndef NDEBUG
4586 unsigned AssumedMinimumVscale =
4587 getVScaleForTuning(OrigLoop, TTI).value_or(1);
4588 unsigned Width =
4589 Candidate.Width.isScalable()
4590 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4591 : Candidate.Width.getFixedValue();
4592 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4593 << " costs: " << (Candidate.Cost / Width));
4594 if (VF.isScalable())
4595 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4596 << AssumedMinimumVscale << ")");
4597 LLVM_DEBUG(dbgs() << ".\n");
4598#endif
4599
4600 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4601 LLVM_DEBUG(
4602 dbgs()
4603 << "LV: Not considering vector loop of width " << VF
4604 << " because it will not generate any vector instructions.\n");
4605 continue;
4606 }
4607
4608 if (isMoreProfitable(Candidate, ChosenFactor))
4609 ChosenFactor = Candidate;
4610 }
4611 }
4612
4615 "There are conditional stores.",
4616 "store that is conditionally executed prevents vectorization",
4617 "ConditionalStore", ORE, OrigLoop);
4618 ChosenFactor = ScalarCost;
4619 }
4620
4621 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4622 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4623 << "LV: Vectorization seems to be not beneficial, "
4624 << "but was forced by a user.\n");
4625 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4626 return ChosenFactor;
4627}
4628
4629bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4630 ElementCount VF) const {
4631 // Cross iteration phis such as reductions need special handling and are
4632 // currently unsupported.
4633 if (any_of(OrigLoop->getHeader()->phis(),
4634 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4635 return false;
4636
4637 // Phis with uses outside of the loop require special handling and are
4638 // currently unsupported.
4639 for (const auto &Entry : Legal->getInductionVars()) {
4640 // Look for uses of the value of the induction at the last iteration.
4641 Value *PostInc =
4642 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4643 for (User *U : PostInc->users())
4644 if (!OrigLoop->contains(cast<Instruction>(U)))
4645 return false;
4646    // Look for uses of the penultimate value of the induction.
4647 for (User *U : Entry.first->users())
4648 if (!OrigLoop->contains(cast<Instruction>(U)))
4649 return false;
4650 }
4651
4652  // Epilogue vectorization code has not been audited to ensure it handles
4653  // non-latch exits properly. It may be fine, but it needs to be audited and
4654  // tested.
4655 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4656 return false;
4657
4658 return true;
4659}
4660
4662 const ElementCount VF) const {
4663 // FIXME: We need a much better cost-model to take different parameters such
4664 // as register pressure, code size increase and cost of extra branches into
4665 // account. For now we apply a very crude heuristic and only consider loops
4666 // with vectorization factors larger than a certain value.
4667
4668 // Allow the target to opt out entirely.
4670 return false;
4671
4672 // We also consider epilogue vectorization unprofitable for targets that don't
4673  // consider interleaving beneficial (e.g. MVE).
4674 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4675 return false;
4676
4677 unsigned Multiplier = 1;
4678 if (VF.isScalable())
4679 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
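  // As an illustration (hypothetical values, not from the original source):
  // with MainLoopVF = vscale x 2 and a tuning vscale of 4, the estimated
  // effective width is 2 * 4 = 8 lanes, and that value is what is compared
  // against the EpilogueVectorizationMinVF threshold below.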
4680 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4681 return true;
4682 return false;
4683}
4684
4686 const ElementCount MainLoopVF, unsigned IC) {
4689 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4690 return Result;
4691 }
4692
4693 if (!CM.isScalarEpilogueAllowed()) {
4694 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4695 "epilogue is allowed.\n");
4696 return Result;
4697 }
4698
4699 // Not really a cost consideration, but check for unsupported cases here to
4700 // simplify the logic.
4701 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4702 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4703 "is not a supported candidate.\n");
4704 return Result;
4705 }
4706
4708 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4710 if (hasPlanWithVF(ForcedEC))
4711 return {ForcedEC, 0, 0};
4712 else {
4713 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4714 "viable.\n");
4715 return Result;
4716 }
4717 }
4718
4719 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4720 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4721 LLVM_DEBUG(
4722 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4723 return Result;
4724 }
4725
4726 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
4727 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4728 "this loop\n");
4729 return Result;
4730 }
4731
4732 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4733 // the main loop handles 8 lanes per iteration. We could still benefit from
4734 // vectorizing the epilogue loop with VF=4.
4735 ElementCount EstimatedRuntimeVF = MainLoopVF;
4736 if (MainLoopVF.isScalable()) {
4737 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
4738 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
4739 EstimatedRuntimeVF *= *VScale;
4740 }
4741
4742 ScalarEvolution &SE = *PSE.getSE();
4743 Type *TCType = Legal->getWidestInductionType();
4744 const SCEV *RemainingIterations = nullptr;
4745 for (auto &NextVF : ProfitableVFs) {
4746 // Skip candidate VFs without a corresponding VPlan.
4747 if (!hasPlanWithVF(NextVF.Width))
4748 continue;
4749
4750    // Skip candidate VFs with widths >= the estimated runtime VF (scalable
4751 // vectors) or the VF of the main loop (fixed vectors).
4752 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4753 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4754 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
4755 continue;
4756
4757 // If NextVF is greater than the number of remaining iterations, the
4758 // epilogue loop would be dead. Skip such factors.
4759 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4760 // TODO: extend to support scalable VFs.
4761 if (!RemainingIterations) {
4762 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
4763 RemainingIterations = SE.getURemExpr(
4764 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4765 }
4766 if (SE.isKnownPredicate(
4768 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4769 RemainingIterations))
4770 continue;
4771 }
4772
4773 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
4774 Result = NextVF;
4775 }
4776
4777 if (Result != VectorizationFactor::Disabled())
4778 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4779 << Result.Width << "\n");
4780 return Result;
4781}
4782
4783std::pair<unsigned, unsigned>
4785 unsigned MinWidth = -1U;
4786 unsigned MaxWidth = 8;
4788 // For in-loop reductions, no element types are added to ElementTypesInLoop
4789 // if there are no loads/stores in the loop. In this case, check through the
4790 // reduction variables to determine the maximum width.
4791 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4792 // Reset MaxWidth so that we can find the smallest type used by recurrences
4793 // in the loop.
4794 MaxWidth = -1U;
4795 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4796 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4797 // When finding the min width used by the recurrence we need to account
4798 // for casts on the input operands of the recurrence.
4799 MaxWidth = std::min<unsigned>(
4800 MaxWidth, std::min<unsigned>(
4803 }
4804 } else {
4805 for (Type *T : ElementTypesInLoop) {
4806 MinWidth = std::min<unsigned>(
4807 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4808 MaxWidth = std::max<unsigned>(
4809 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4810 }
4811 }
4812 return {MinWidth, MaxWidth};
4813}
4814
4816 ElementTypesInLoop.clear();
4817 // For each block.
4818 for (BasicBlock *BB : TheLoop->blocks()) {
4819 // For each instruction in the loop.
4820 for (Instruction &I : BB->instructionsWithoutDebug()) {
4821 Type *T = I.getType();
4822
4823 // Skip ignored values.
4824 if (ValuesToIgnore.count(&I))
4825 continue;
4826
4827 // Only examine Loads, Stores and PHINodes.
4828 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4829 continue;
4830
4831 // Examine PHI nodes that are reduction variables. Update the type to
4832 // account for the recurrence type.
4833 if (auto *PN = dyn_cast<PHINode>(&I)) {
4834 if (!Legal->isReductionVariable(PN))
4835 continue;
4836 const RecurrenceDescriptor &RdxDesc =
4837 Legal->getReductionVars().find(PN)->second;
4840 RdxDesc.getRecurrenceType(),
4842 continue;
4843 T = RdxDesc.getRecurrenceType();
4844 }
4845
4846 // Examine the stored values.
4847 if (auto *ST = dyn_cast<StoreInst>(&I))
4848 T = ST->getValueOperand()->getType();
4849
4850 assert(T->isSized() &&
4851 "Expected the load/store/recurrence type to be sized");
4852
4853 ElementTypesInLoop.insert(T);
4854 }
4855 }
4856}
4857
4858unsigned
4860 InstructionCost LoopCost) {
4861 // -- The interleave heuristics --
4862 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4863 // There are many micro-architectural considerations that we can't predict
4864 // at this level. For example, frontend pressure (on decode or fetch) due to
4865 // code size, or the number and capabilities of the execution ports.
4866 //
4867 // We use the following heuristics to select the interleave count:
4868 // 1. If the code has reductions, then we interleave to break the cross
4869 // iteration dependency.
4870 // 2. If the loop is really small, then we interleave to reduce the loop
4871 // overhead.
4872 // 3. We don't interleave if we think that we will spill registers to memory
4873 // due to the increased register pressure.
4874
4876 return 1;
4877
4878 // Do not interleave if EVL is preferred and no User IC is specified.
4879 if (foldTailWithEVL()) {
4880 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4881 "Unroll factor forced to be 1.\n");
4882 return 1;
4883 }
4884
4885 // We used the distance for the interleave count.
4887 return 1;
4888
4889 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
4890 const bool HasReductions = !Legal->getReductionVars().empty();
4891
4892 // If we did not calculate the cost for VF (because the user selected the VF)
4893 // then we calculate the cost of VF here.
4894 if (LoopCost == 0) {
4895 LoopCost = expectedCost(VF);
4896 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4897
4898 // Loop body is free and there is no need for interleaving.
4899 if (LoopCost == 0)
4900 return 1;
4901 }
4902
4904  // We divide by these values, so assume that we have at least one
4905 // instruction that uses at least one register.
4906 for (auto& pair : R.MaxLocalUsers) {
4907 pair.second = std::max(pair.second, 1U);
4908 }
4909
4910 // We calculate the interleave count using the following formula.
4911 // Subtract the number of loop invariants from the number of available
4912 // registers. These registers are used by all of the interleaved instances.
4913 // Next, divide the remaining registers by the number of registers that is
4914 // required by the loop, in order to estimate how many parallel instances
4915 // fit without causing spills. All of this is rounded down if necessary to be
4916  // a power of two. We want power-of-two interleave counts to simplify any
4917 // addressing operations or alignment considerations.
4918 // We also want power of two interleave counts to ensure that the induction
4919 // variable of the vector loop wraps to zero, when tail is folded by masking;
4920 // this currently happens when OptForSize, in which case IC is set to 1 above.
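  // As a worked example (hypothetical numbers, not from the original source):
  // with 32 registers in a class, 2 of them held by loop-invariant values and
  // at most 6 values of that class live at once inside the loop, the estimate
  // is bit_floor((32 - 2) / 6) = bit_floor(5) = 4, i.e. roughly 4 interleaved
  // copies are expected to fit without spilling.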
4921 unsigned IC = UINT_MAX;
4922
4923 for (auto& pair : R.MaxLocalUsers) {
4924 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4925 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4926 << " registers of "
4927 << TTI.getRegisterClassName(pair.first) << " register class\n");
4928 if (VF.isScalar()) {
4929 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4930 TargetNumRegisters = ForceTargetNumScalarRegs;
4931 } else {
4932 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4933 TargetNumRegisters = ForceTargetNumVectorRegs;
4934 }
4935 unsigned MaxLocalUsers = pair.second;
4936 unsigned LoopInvariantRegs = 0;
4937 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
4938 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
4939
4940 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4941 MaxLocalUsers);
4942 // Don't count the induction variable as interleaved.
4944 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4945 std::max(1U, (MaxLocalUsers - 1)));
4946 }
4947
4948 IC = std::min(IC, TmpIC);
4949 }
4950
4951 // Clamp the interleave ranges to reasonable counts.
4952 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4953
4954 // Check if the user has overridden the max.
4955 if (VF.isScalar()) {
4956 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4957 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4958 } else {
4959 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4960 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4961 }
4962
4963 unsigned EstimatedVF = VF.getKnownMinValue();
4964 if (VF.isScalable()) {
4965 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
4966 EstimatedVF *= *VScale;
4967 }
4968 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4969
4970 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4971 if (KnownTC > 0) {
4972 // At least one iteration must be scalar when this constraint holds. So the
4973 // maximum available iterations for interleaving is one less.
4974 unsigned AvailableTC =
4975 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4976
4977 // If trip count is known we select between two prospective ICs, where
4978 // 1) the aggressive IC is capped by the trip count divided by VF
4979 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4980 // The final IC is selected in a way that the epilogue loop trip count is
4981 // minimized while maximizing the IC itself, so that we either run the
4982 // vector loop at least once if it generates a small epilogue loop, or else
4983 // we run the vector loop at least twice.
4984
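    // As a worked example (hypothetical numbers, not from the original
    // source): with AvailableTC = 100, EstimatedVF = 8 and a target maximum of
    // 8, the aggressive bound is bit_floor(min(100 / 8, 8)) = 8 and the
    // conservative bound is bit_floor(min(100 / 16, 8)) = 4. The scalar tails
    // are 100 % 64 = 36 and 100 % 32 = 4 respectively; they differ, so the
    // conservative value 4 is kept.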
4985 unsigned InterleaveCountUB = bit_floor(
4986 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4987 unsigned InterleaveCountLB = bit_floor(std::max(
4988 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4989 MaxInterleaveCount = InterleaveCountLB;
4990
4991 if (InterleaveCountUB != InterleaveCountLB) {
4992 unsigned TailTripCountUB =
4993 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4994 unsigned TailTripCountLB =
4995 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4996      // If both produce the same scalar tail, maximize the IC to do the same
4997      // work in fewer vector loop iterations.
4998 if (TailTripCountUB == TailTripCountLB)
4999 MaxInterleaveCount = InterleaveCountUB;
5000 }
5001 } else if (BestKnownTC && *BestKnownTC > 0) {
5002 // At least one iteration must be scalar when this constraint holds. So the
5003 // maximum available iterations for interleaving is one less.
5004 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5005 ? (*BestKnownTC) - 1
5006 : *BestKnownTC;
5007
5008    // If the trip count is an estimated compile-time constant, cap the IC by
5009    // the trip count divided by VF * 2, such that the vector loop runs at
5010    // least twice; this makes interleaving seem profitable when there is an
5011    // epilogue loop present. Since the exact trip count is not known, we
5012    // choose to be conservative in our IC estimate.
5013 MaxInterleaveCount = bit_floor(std::max(
5014 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5015 }
5016
5017 assert(MaxInterleaveCount > 0 &&
5018 "Maximum interleave count must be greater than 0");
5019
5020  // Clamp the calculated IC to be between 1 and the max interleave count
5021  // that the target and trip count allow.
5022 if (IC > MaxInterleaveCount)
5023 IC = MaxInterleaveCount;
5024 else
5025 // Make sure IC is greater than 0.
5026 IC = std::max(1u, IC);
5027
5028 assert(IC > 0 && "Interleave count must be greater than 0.");
5029
5030 // Interleave if we vectorized this loop and there is a reduction that could
5031 // benefit from interleaving.
5032 if (VF.isVector() && HasReductions) {
5033 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5034 return IC;
5035 }
5036
5037 // For any scalar loop that either requires runtime checks or predication we
5038 // are better off leaving this to the unroller. Note that if we've already
5039 // vectorized the loop we will have done the runtime check and so interleaving
5040 // won't require further checks.
5041 bool ScalarInterleavingRequiresPredication =
5042 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5043 return Legal->blockNeedsPredication(BB);
5044 }));
5045 bool ScalarInterleavingRequiresRuntimePointerCheck =
5047
5048 // We want to interleave small loops in order to reduce the loop overhead and
5049 // potentially expose ILP opportunities.
5050 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5051 << "LV: IC is " << IC << '\n'
5052 << "LV: VF is " << VF << '\n');
5053 const bool AggressivelyInterleaveReductions =
5054 TTI.enableAggressiveInterleaving(HasReductions);
5055 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5056 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5057 // We assume that the cost overhead is 1 and we use the cost model
5058 // to estimate the cost of the loop and interleave until the cost of the
5059 // loop overhead is about 5% of the cost of the loop.
5060 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5061 SmallLoopCost / *LoopCost.getValue()));
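    // For instance (hypothetical numbers, not from the original source): if
    // SmallLoopCost were 20 and LoopCost were 5, the body could be replicated
    // bit_floor(20 / 5) = 4 times, at which point the assumed per-iteration
    // overhead of 1 is about 5% of the 4 * 5 = 20 units of loop body work, so
    // SmallIC would be min(IC, 4).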
5062
5063 // Interleave until store/load ports (estimated by max interleave count) are
5064 // saturated.
5065 unsigned NumStores = Legal->getNumStores();
5066 unsigned NumLoads = Legal->getNumLoads();
5067 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5068 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5069
5070 // There is little point in interleaving for reductions containing selects
5071 // and compares when VF=1 since it may just create more overhead than it's
5072 // worth for loops with small trip counts. This is because we still have to
5073 // do the final reduction after the loop.
5074 bool HasSelectCmpReductions =
5075 HasReductions &&
5076 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5077 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5078 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5079 RdxDesc.getRecurrenceKind());
5080 });
5081 if (HasSelectCmpReductions) {
5082 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5083 return 1;
5084 }
5085
5086 // If we have a scalar reduction (vector reductions are already dealt with
5087 // by this point), we can increase the critical path length if the loop
5088 // we're interleaving is inside another loop. For tree-wise reductions
5089 // set the limit to 2, and for ordered reductions it's best to disable
5090 // interleaving entirely.
5091 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5092 bool HasOrderedReductions =
5093 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5094 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5095 return RdxDesc.isOrdered();
5096 });
5097 if (HasOrderedReductions) {
5098 LLVM_DEBUG(
5099 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5100 return 1;
5101 }
5102
5103 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5104 SmallIC = std::min(SmallIC, F);
5105 StoresIC = std::min(StoresIC, F);
5106 LoadsIC = std::min(LoadsIC, F);
5107 }
5108
5110 std::max(StoresIC, LoadsIC) > SmallIC) {
5111 LLVM_DEBUG(
5112 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5113 return std::max(StoresIC, LoadsIC);
5114 }
5115
5116 // If there are scalar reductions and TTI has enabled aggressive
5117 // interleaving for reductions, we will interleave to expose ILP.
5118 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5119 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5120 // Interleave no less than SmallIC but not as aggressive as the normal IC
5121 // to satisfy the rare situation when resources are too limited.
5122 return std::max(IC / 2, SmallIC);
5123 } else {
5124 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5125 return SmallIC;
5126 }
5127 }
5128
5129 // Interleave if this is a large loop (small loops are already dealt with by
5130 // this point) that could benefit from interleaving.
5131 if (AggressivelyInterleaveReductions) {
5132 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5133 return IC;
5134 }
5135
5136 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5137 return 1;
5138}
5139
5142 // This function calculates the register usage by measuring the highest number
5143 // of values that are alive at a single location. Obviously, this is a very
5144 // rough estimation. We scan the loop in topological order and assign a
5145 // number to each instruction. We use RPO to ensure that defs are
5146 // met before their users. We assume that each instruction that has in-loop
5147 // users starts an interval. We record every time that an in-loop value is
5148 // used, so we have a list of the first and last occurrences of each
5149 // instruction. Next, we transpose this data structure into a multi map that
5150 // holds the list of intervals that *end* at a specific location. This multi
5151 // map allows us to perform a linear search. We scan the instructions linearly
5152 // and record each time that a new interval starts, by placing it in a set.
5153 // If we find this value in the multi-map then we remove it from the set.
5154 // The max register usage is the maximum size of the set.
5155 // We also search for instructions that are defined outside the loop, but are
5156 // used inside the loop. We need this number separately from the max-interval
5157 // usage number because when we unroll, loop-invariant values do not take
5158 // more registers.
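// As an illustrative example (hypothetical instruction sequence, not from the
// original source), consider the RPO-ordered body
//   %a = load ...        ; interval of %a opens
//   %b = load ...        ; interval of %b opens
//   %c = add %a, %b      ; last use of %a and %b, interval of %c opens
//   store %c, ...        ; last use of %c
// At most two values (%a and %b) are live at the same time, so the estimated
// usage for that register class is 2; a loop-invariant base pointer used by
// the loads would be counted separately in LoopInvariantRegs.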
5160 DFS.perform(LI);
5161
5162 RegisterUsage RU;
5163
5164 // Each 'key' in the map opens a new interval. The values
5165 // of the map are the index of the 'last seen' usage of the
5166 // instruction that is the key.
5168
5169 // Maps instruction to its index.
5171 // Marks the end of each interval.
5172 IntervalMap EndPoint;
5173 // Saves the list of instruction indices that are used in the loop.
5175 // Saves the list of values that are used in the loop but are defined outside
5176 // the loop (not including non-instruction values such as arguments and
5177 // constants).
5178 SmallSetVector<Instruction *, 8> LoopInvariants;
5179
5180 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5181 for (Instruction &I : BB->instructionsWithoutDebug()) {
5182 IdxToInstr.push_back(&I);
5183
5184 // Save the end location of each USE.
5185 for (Value *U : I.operands()) {
5186 auto *Instr = dyn_cast<Instruction>(U);
5187
5188 // Ignore non-instruction values such as arguments, constants, etc.
5189 // FIXME: Might need some motivation why these values are ignored. If
5190 // for example an argument is used inside the loop it will increase the
5191 // register pressure (so shouldn't we add it to LoopInvariants).
5192 if (!Instr)
5193 continue;
5194
5195 // If this instruction is outside the loop then record it and continue.
5196 if (!TheLoop->contains(Instr)) {
5197 LoopInvariants.insert(Instr);
5198 continue;
5199 }
5200
5201 // Overwrite previous end points.
5202 EndPoint[Instr] = IdxToInstr.size();
5203 Ends.insert(Instr);
5204 }
5205 }
5206 }
5207
5208 // Saves the list of intervals that end with the index in 'key'.
5209 using InstrList = SmallVector<Instruction *, 2>;
5210 DenseMap<unsigned, InstrList> TransposeEnds;
5211
5212 // Transpose the EndPoints to a list of values that end at each index.
5213 for (auto &Interval : EndPoint)
5214 TransposeEnds[Interval.second].push_back(Interval.first);
5215
5216 SmallPtrSet<Instruction *, 8> OpenIntervals;
5219
5220 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5221
5222 const auto &TTICapture = TTI;
5223 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5224 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5225 return 0;
5226 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5227 };
5228
5229 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5230 Instruction *I = IdxToInstr[i];
5231
5232 // Remove all of the instructions that end at this location.
5233 InstrList &List = TransposeEnds[i];
5234 for (Instruction *ToRemove : List)
5235 OpenIntervals.erase(ToRemove);
5236
5237 // Ignore instructions that are never used within the loop.
5238 if (!Ends.count(I))
5239 continue;
5240
5241 // Skip ignored values.
5242 if (ValuesToIgnore.count(I))
5243 continue;
5244
5246
5247 // For each VF find the maximum usage of registers.
5248 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5249 // Count the number of registers used, per register class, given all open
5250 // intervals.
5251 // Note that elements in this SmallMapVector will be default constructed
5252 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5253 // there is no previous entry for ClassID.
5255
5256 if (VFs[j].isScalar()) {
5257 for (auto *Inst : OpenIntervals) {
5258 unsigned ClassID =
5259 TTI.getRegisterClassForType(false, Inst->getType());
5260 // FIXME: The target might use more than one register for the type
5261 // even in the scalar case.
5262 RegUsage[ClassID] += 1;
5263 }
5264 } else {
5266 for (auto *Inst : OpenIntervals) {
5267 // Skip ignored values for VF > 1.
5268 if (VecValuesToIgnore.count(Inst))
5269 continue;
5270 if (isScalarAfterVectorization(Inst, VFs[j])) {
5271 unsigned ClassID =
5272 TTI.getRegisterClassForType(false, Inst->getType());
5273 // FIXME: The target might use more than one register for the type
5274 // even in the scalar case.
5275 RegUsage[ClassID] += 1;
5276 } else {
5277 unsigned ClassID =
5278 TTI.getRegisterClassForType(true, Inst->getType());
5279 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5280 }
5281 }
5282 }
5283
5284 for (auto& pair : RegUsage) {
5285 auto &Entry = MaxUsages[j][pair.first];
5286 Entry = std::max(Entry, pair.second);
5287 }
5288 }
5289
5290 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5291 << OpenIntervals.size() << '\n');
5292
5293 // Add the current instruction to the list of open intervals.
5294 OpenIntervals.insert(I);
5295 }
5296
5297 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5298 // Note that elements in this SmallMapVector will be default constructed
5299 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5300 // there is no previous entry for ClassID.
5302
5303 for (auto *Inst : LoopInvariants) {
5304 // FIXME: The target might use more than one register for the type
5305 // even in the scalar case.
5306 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5307 auto *I = cast<Instruction>(U);
5308 return TheLoop != LI->getLoopFor(I->getParent()) ||
5309 isScalarAfterVectorization(I, VFs[i]);
5310 });
5311
5312 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5313 unsigned ClassID =
5314 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5315 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5316 }
5317
5318 LLVM_DEBUG({
5319 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5320 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5321 << " item\n";
5322 for (const auto &pair : MaxUsages[i]) {
5323 dbgs() << "LV(REG): RegisterClass: "
5324 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5325 << " registers\n";
5326 }
5327 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5328 << " item\n";
5329 for (const auto &pair : Invariant) {
5330 dbgs() << "LV(REG): RegisterClass: "
5331 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5332 << " registers\n";
5333 }
5334 });
5335
5336 RU.LoopInvariantRegs = Invariant;
5337 RU.MaxLocalUsers = MaxUsages[i];
5338 RUs[i] = RU;
5339 }
5340
5341 return RUs;
5342}
5343
5344bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5345 ElementCount VF) {
5346 // TODO: Cost model for emulated masked load/store is completely
5347 // broken. This hack guides the cost model to use an artificially
5348 // high enough value to practically disable vectorization with such
5349 // operations, except where previously deployed legality hack allowed
5350 // using very low cost values. This is to avoid regressions coming simply
5351 // from moving "masked load/store" check from legality to cost model.
5352 // Masked Load/Gather emulation was previously never allowed.
5353 // Limited number of Masked Store/Scatter emulation was allowed.
5355 "Expecting a scalar emulated instruction");
5356 return isa<LoadInst>(I) ||
5357 (isa<StoreInst>(I) &&
5358 NumPredStores > NumberOfStoresToPredicate);
5359}
5360
5362 // If we aren't vectorizing the loop, or if we've already collected the
5363 // instructions to scalarize, there's nothing to do. Collection may already
5364 // have occurred if we have a user-selected VF and are now computing the
5365 // expected cost for interleaving.
5366 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5367 return;
5368
5369  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5370 // not profitable to scalarize any instructions, the presence of VF in the
5371 // map will indicate that we've analyzed it already.
5372 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5373
5374 PredicatedBBsAfterVectorization[VF].clear();
5375
5376 // Find all the instructions that are scalar with predication in the loop and
5377 // determine if it would be better to not if-convert the blocks they are in.
5378 // If so, we also record the instructions to scalarize.
5379 for (BasicBlock *BB : TheLoop->blocks()) {
5381 continue;
5382 for (Instruction &I : *BB)
5383 if (isScalarWithPredication(&I, VF)) {
5384 ScalarCostsTy ScalarCosts;
5385 // Do not apply discount logic for:
5386 // 1. Scalars after vectorization, as there will only be a single copy
5387 // of the instruction.
5388 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5389 // 3. Emulated masked memrefs, if a hacked cost is needed.
5390 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5391 !useEmulatedMaskMemRefHack(&I, VF) &&
5392 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5393 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5394 // Remember that BB will remain after vectorization.
5395 PredicatedBBsAfterVectorization[VF].insert(BB);
5396 for (auto *Pred : predecessors(BB)) {
5397 if (Pred->getSingleSuccessor() == BB)
5398 PredicatedBBsAfterVectorization[VF].insert(Pred);
5399 }
5400 }
5401 }
5402}
5403
5404InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5405 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5406 assert(!isUniformAfterVectorization(PredInst, VF) &&
5407 "Instruction marked uniform-after-vectorization will be predicated");
5408
5409 // Initialize the discount to zero, meaning that the scalar version and the
5410 // vector version cost the same.
5411 InstructionCost Discount = 0;
5412
5413 // Holds instructions to analyze. The instructions we visit are mapped in
5414 // ScalarCosts. Those instructions are the ones that would be scalarized if
5415 // we find that the scalar version costs less.
5417
5418 // Returns true if the given instruction can be scalarized.
5419 auto canBeScalarized = [&](Instruction *I) -> bool {
5420 // We only attempt to scalarize instructions forming a single-use chain
5421 // from the original predicated block that would otherwise be vectorized.
5422 // Although not strictly necessary, we give up on instructions we know will
5423 // already be scalar to avoid traversing chains that are unlikely to be
5424 // beneficial.
5425 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5427 return false;
5428
5429 // If the instruction is scalar with predication, it will be analyzed
5430 // separately. We ignore it within the context of PredInst.
5431 if (isScalarWithPredication(I, VF))
5432 return false;
5433
5434 // If any of the instruction's operands are uniform after vectorization,
5435 // the instruction cannot be scalarized. This prevents, for example, a
5436 // masked load from being scalarized.
5437 //
5438 // We assume we will only emit a value for lane zero of an instruction
5439 // marked uniform after vectorization, rather than VF identical values.
5440 // Thus, if we scalarize an instruction that uses a uniform, we would
5441 // create uses of values corresponding to the lanes we aren't emitting code
5442 // for. This behavior can be changed by allowing getScalarValue to clone
5443 // the lane zero values for uniforms rather than asserting.
5444 for (Use &U : I->operands())
5445 if (auto *J = dyn_cast<Instruction>(U.get()))
5446 if (isUniformAfterVectorization(J, VF))
5447 return false;
5448
5449 // Otherwise, we can scalarize the instruction.
5450 return true;
5451 };
5452
5453 // Compute the expected cost discount from scalarizing the entire expression
5454 // feeding the predicated instruction. We currently only consider expressions
5455 // that are single-use instruction chains.
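  // For example (hypothetical costs, not from the original source): if the
  // vector form of a predicated instruction costs 10 and its scalarized form
  // costs 12, and the predicated block runs roughly every other iteration
  // (reciprocal block probability 2), the scaled scalar cost is 12 / 2 = 6 and
  // the discount grows by 10 - 6 = 4, i.e. scalarization looks beneficial.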
5456 Worklist.push_back(PredInst);
5457 while (!Worklist.empty()) {
5458 Instruction *I = Worklist.pop_back_val();
5459
5460 // If we've already analyzed the instruction, there's nothing to do.
5461 if (ScalarCosts.contains(I))
5462 continue;
5463
5464 // Compute the cost of the vector instruction. Note that this cost already
5465 // includes the scalarization overhead of the predicated instruction.
5466 InstructionCost VectorCost = getInstructionCost(I, VF);
5467
5468 // Compute the cost of the scalarized instruction. This cost is the cost of
5469 // the instruction as if it wasn't if-converted and instead remained in the
5470 // predicated block. We will scale this cost by block probability after
5471 // computing the scalarization overhead.
5472 InstructionCost ScalarCost =
5474
5475 // Compute the scalarization overhead of needed insertelement instructions
5476 // and phi nodes.
5478 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5479 ScalarCost += TTI.getScalarizationOverhead(
5480 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5481 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5482 /*Extract*/ false, CostKind);
5483 ScalarCost +=
5484 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5485 }
5486
5487 // Compute the scalarization overhead of needed extractelement
5488 // instructions. For each of the instruction's operands, if the operand can
5489 // be scalarized, add it to the worklist; otherwise, account for the
5490 // overhead.
5491 for (Use &U : I->operands())
5492 if (auto *J = dyn_cast<Instruction>(U.get())) {
5493 assert(VectorType::isValidElementType(J->getType()) &&
5494 "Instruction has non-scalar type");
5495 if (canBeScalarized(J))
5496 Worklist.push_back(J);
5497 else if (needsExtract(J, VF)) {
5498 ScalarCost += TTI.getScalarizationOverhead(
5499 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5500 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5501 /*Extract*/ true, CostKind);
5502 }
5503 }
5504
5505 // Scale the total scalar cost by block probability.
5506 ScalarCost /= getReciprocalPredBlockProb();
5507
5508 // Compute the discount. A non-negative discount means the vector version
5509 // of the instruction costs more, and scalarizing would be beneficial.
5510 Discount += VectorCost - ScalarCost;
5511 ScalarCosts[I] = ScalarCost;
5512 }
5513
5514 return Discount;
5515}
5516
5519
5520 // For each block.
5521 for (BasicBlock *BB : TheLoop->blocks()) {
5522 InstructionCost BlockCost;
5523
5524 // For each instruction in the old loop.
5525 for (Instruction &I : BB->instructionsWithoutDebug()) {
5526 // Skip ignored values.
5527 if (ValuesToIgnore.count(&I) ||
5528 (VF.isVector() && VecValuesToIgnore.count(&I)))
5529 continue;
5530
5532
5533 // Check if we should override the cost.
5534 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5536
5537 BlockCost += C;
5538 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5539 << VF << " For instruction: " << I << '\n');
5540 }
5541
5542 // If we are vectorizing a predicated block, it will have been
5543 // if-converted. This means that the block's instructions (aside from
5544 // stores and instructions that may divide by zero) will now be
5545 // unconditionally executed. For the scalar case, we may not always execute
5546 // the predicated block, if it is an if-else block. Thus, scale the block's
5547 // cost by the probability of executing it. blockNeedsPredication from
5548 // Legal is used so as to not include all blocks in tail folded loops.
5549 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5550 BlockCost /= getReciprocalPredBlockProb();
5551
5552 Cost += BlockCost;
5553 }
5554
5555 return Cost;
5556}
5557
5558/// Gets Address Access SCEV after verifying that the access pattern
5559/// is loop invariant except the induction variable dependence.
5560///
5561/// This SCEV can be sent to the Target in order to estimate the address
5562/// calculation cost.
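///
/// For example (illustrative IR, not taken from the original source), a GEP
/// such as
///   %gep = getelementptr inbounds [64 x i32], ptr %A, i64 %inv, i64 %iv
/// qualifies when %inv is loop invariant and %iv is an induction variable,
/// whereas a GEP with an index that is neither loop invariant nor an
/// induction variable does not, and nullptr is returned for it.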
5564 Value *Ptr,
5567 const Loop *TheLoop) {
5568
5569 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5570 if (!Gep)
5571 return nullptr;
5572
5573 // We are looking for a gep with all loop invariant indices except for one
5574 // which should be an induction variable.
5575 auto SE = PSE.getSE();
5576 unsigned NumOperands = Gep->getNumOperands();
5577 for (unsigned i = 1; i < NumOperands; ++i) {
5578 Value *Opd = Gep->getOperand(i);
5579 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5580 !Legal->isInductionVariable(Opd))
5581 return nullptr;
5582 }
5583
5584  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5585 return PSE.getSCEV(Ptr);
5586}
5587
5589LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5590 ElementCount VF) {
5591 assert(VF.isVector() &&
5592 "Scalarization cost of instruction implies vectorization.");
5593 if (VF.isScalable())
5595
5596 Type *ValTy = getLoadStoreType(I);
5597 auto SE = PSE.getSE();
5598
5599 unsigned AS = getLoadStoreAddressSpace(I);
5601 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5602 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5603 // that it is being called from this specific place.
5604
5605 // Figure out whether the access is strided and get the stride value
5606  // if it's known at compile time.
5607 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5608
5609 // Get the cost of the scalar memory instruction and address computation.
5611 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5612
5613 // Don't pass *I here, since it is scalar but will actually be part of a
5614 // vectorized loop where the user of it is a vectorized instruction.
5616 const Align Alignment = getLoadStoreAlignment(I);
5617 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5618 ValTy->getScalarType(),
5619 Alignment, AS, CostKind);
5620
5621 // Get the overhead of the extractelement and insertelement instructions
5622 // we might create due to scalarization.
5623 Cost += getScalarizationOverhead(I, VF, CostKind);
5624
5625 // If we have a predicated load/store, it will need extra i1 extracts and
5626 // conditional branches, but may not be executed for each vector lane. Scale
5627 // the cost by the probability of executing the predicated block.
5628 if (isPredicatedInst(I)) {
5630
5631 // Add the cost of an i1 extract and a branch
5632 auto *Vec_i1Ty =
5635 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5636 /*Insert=*/false, /*Extract=*/true, CostKind);
5637 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5638
5639 if (useEmulatedMaskMemRefHack(I, VF))
5640 // Artificially setting to a high enough value to practically disable
5641 // vectorization with such operations.
5642 Cost = 3000000;
5643 }
5644
5645 return Cost;
5646}
5647
5649LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5650 ElementCount VF) {
5651 Type *ValTy = getLoadStoreType(I);
5652 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5654 unsigned AS = getLoadStoreAddressSpace(I);
5655 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5657
5658 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5659 "Stride should be 1 or -1 for consecutive memory access");
5660 const Align Alignment = getLoadStoreAlignment(I);
5662 if (Legal->isMaskRequired(I)) {
5663 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5664 CostKind);
5665 } else {
5666 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5667 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5668 CostKind, OpInfo, I);
5669 }
5670
5671 bool Reverse = ConsecutiveStride < 0;
5672 if (Reverse)
5674 std::nullopt, CostKind, 0);
5675 return Cost;
5676}
5677
5679LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5680 ElementCount VF) {
5681 assert(Legal->isUniformMemOp(*I, VF));
5682
5683 Type *ValTy = getLoadStoreType(I);
5684 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5685 const Align Alignment = getLoadStoreAlignment(I);
5686 unsigned AS = getLoadStoreAddressSpace(I);
5688 if (isa<LoadInst>(I)) {
5689 return TTI.getAddressComputationCost(ValTy) +
5690 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5691 CostKind) +
5693 }
5694 StoreInst *SI = cast<StoreInst>(I);
5695
5696 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5697 return TTI.getAddressComputationCost(ValTy) +
5698 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5699 CostKind) +
5700 (isLoopInvariantStoreValue
5701 ? 0
5702 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5703 CostKind, VF.getKnownMinValue() - 1));
5704}
5705
5707LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5708 ElementCount VF) {
5709 Type *ValTy = getLoadStoreType(I);
5710 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5711 const Align Alignment = getLoadStoreAlignment(I);
5713
5714 return TTI.getAddressComputationCost(VectorTy) +
5716 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5718}
5719
5721LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5722 ElementCount VF) {
5723 Type *ValTy = getLoadStoreType(I);
5724 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5725 unsigned AS = getLoadStoreAddressSpace(I);
5727
5728 auto Group = getInterleavedAccessGroup(I);
5729 assert(Group && "Fail to get an interleaved access group.");
5730
5731 unsigned InterleaveFactor = Group->getFactor();
5732 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5733
5734 // Holds the indices of existing members in the interleaved group.
5736 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5737 if (Group->getMember(IF))
5738 Indices.push_back(IF);
5739
5740 // Calculate the cost of the whole interleaved group.
5741 bool UseMaskForGaps =
5742 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5743 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5745 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
5746 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
5747
5748 if (Group->isReverse()) {
5749 // TODO: Add support for reversed masked interleaved access.
5751 "Reverse masked interleaved access not supported.");
5752 Cost += Group->getNumMembers() *
5754 std::nullopt, CostKind, 0);
5755 }
5756 return Cost;
5757}
5758
5759std::optional<InstructionCost>
5761 Instruction *I, ElementCount VF, Type *Ty,
5763 using namespace llvm::PatternMatch;
5764 // Early exit for no inloop reductions
5765 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5766 return std::nullopt;
5767 auto *VectorTy = cast<VectorType>(Ty);
5768
5769 // We are looking for a pattern of, and finding the minimal acceptable cost:
5770 // reduce(mul(ext(A), ext(B))) or
5771 // reduce(mul(A, B)) or
5772 // reduce(ext(A)) or
5773 // reduce(A).
5774 // The basic idea is that we walk down the tree to do that, finding the root
5775 // reduction instruction in InLoopReductionImmediateChains. From there we find
5776 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5777 // of the components. If the reduction cost is lower then we return it for the
5778 // reduction instruction and 0 for the other instructions in the pattern. If
5779 // it is not, we return an invalid cost specifying that the original cost method
5780 // should be used.
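  // As an illustration (hypothetical types and costs, not from the original
  // source): for reduce.add(mul(zext(A), zext(B))) with <16 x i8> inputs, a
  // target with a multiply-accumulate style reduction can report one combined
  // cost for the whole pattern. If that cost is lower than pricing the two
  // extends, the multiply and a plain add reduction separately, the reduction
  // instruction receives the combined cost and the ext/mul instructions in the
  // pattern are costed as 0.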
5781 Instruction *RetI = I;
5782 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5783 if (!RetI->hasOneUser())
5784 return std::nullopt;
5785 RetI = RetI->user_back();
5786 }
5787
5788 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5789 RetI->user_back()->getOpcode() == Instruction::Add) {
5790 RetI = RetI->user_back();
5791 }
5792
5793 // Test if the found instruction is a reduction, and if not return an invalid
5794 // cost specifying the parent to use the original cost modelling.
5795 if (!InLoopReductionImmediateChains.count(RetI))
5796 return std::nullopt;
5797
5798 // Find the reduction this chain is a part of and calculate the basic cost of
5799 // the reduction on its own.
5800 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5801 Instruction *ReductionPhi = LastChain;
5802 while (!isa<PHINode>(ReductionPhi))
5803 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5804
5805 const RecurrenceDescriptor &RdxDesc =
5806 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5807
5808 InstructionCost BaseCost;
5809 RecurKind RK = RdxDesc.getRecurrenceKind();
5812 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5813 RdxDesc.getFastMathFlags(), CostKind);
5814 } else {
5816 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5817 }
5818
5819 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5820 // normal fmul instruction to the cost of the fadd reduction.
5821 if (RK == RecurKind::FMulAdd)
5822 BaseCost +=
5823 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5824
5825 // If we're using ordered reductions then we can just return the base cost
5826 // here, since getArithmeticReductionCost calculates the full ordered
5827 // reduction cost when FP reassociation is not allowed.
5828 if (useOrderedReductions(RdxDesc))
5829 return BaseCost;
5830
5831 // Get the operand that was not the reduction chain and match it to one of the
5832 // patterns, returning the better cost if it is found.
5833 Instruction *RedOp = RetI->getOperand(1) == LastChain
5834 ? dyn_cast<Instruction>(RetI->getOperand(0))
5835 : dyn_cast<Instruction>(RetI->getOperand(1));
5836
5837 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5838
5839 Instruction *Op0, *Op1;
5840 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5841 match(RedOp,
5843 match(Op0, m_ZExtOrSExt(m_Value())) &&
5844 Op0->getOpcode() == Op1->getOpcode() &&
5845 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5847 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5848
5849    // Matched reduce.add(ext(mul(ext(A), ext(B))))
5850 // Note that the extend opcodes need to all match, or if A==B they will have
5851 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5852 // which is equally fine.
5853 bool IsUnsigned = isa<ZExtInst>(Op0);
5854 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5855 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5856
5857 InstructionCost ExtCost =
5858 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5860 InstructionCost MulCost =
5861 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5862 InstructionCost Ext2Cost =
5863 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5865
5867 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5868
5869 if (RedCost.isValid() &&
5870 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5871 return I == RetI ? RedCost : 0;
5872 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5873 !TheLoop->isLoopInvariant(RedOp)) {
5874 // Matched reduce(ext(A))
5875 bool IsUnsigned = isa<ZExtInst>(RedOp);
5876 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5878 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5879 RdxDesc.getFastMathFlags(), CostKind);
5880
5881 InstructionCost ExtCost =
5882 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5884 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5885 return I == RetI ? RedCost : 0;
5886 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5887 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5888 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5889 Op0->getOpcode() == Op1->getOpcode() &&
5891 bool IsUnsigned = isa<ZExtInst>(Op0);
5892 Type *Op0Ty = Op0->getOperand(0)->getType();
5893 Type *Op1Ty = Op1->getOperand(0)->getType();
5894 Type *LargestOpTy =
5895 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5896 : Op0Ty;
5897 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5898
5899 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5900 // different sizes. We take the largest type as the ext to reduce, and add
5901 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5903 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5906 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5908 InstructionCost MulCost =
5909 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5910
5912 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5913 InstructionCost ExtraExtCost = 0;
5914 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5915 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5916 ExtraExtCost = TTI.getCastInstrCost(
5917 ExtraExtOp->getOpcode(), ExtType,
5918 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5920 }
5921
5922 if (RedCost.isValid() &&
5923 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5924 return I == RetI ? RedCost : 0;
5925 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5926 // Matched reduce.add(mul())
5927 InstructionCost MulCost =
5928 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5929
5931 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5932
5933 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5934 return I == RetI ? RedCost : 0;
5935 }
5936 }
5937
5938 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5939}
5940
5942LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5943 ElementCount VF) {
5944 // Calculate scalar cost only. Vectorization cost should be ready at this
5945 // moment.
5946 if (VF.isScalar()) {
5947 Type *ValTy = getLoadStoreType(I);
5948 const Align Alignment = getLoadStoreAlignment(I);
5949 unsigned AS = getLoadStoreAddressSpace(I);
5950
5951 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5952 return TTI.getAddressComputationCost(ValTy) +
5953 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
5954 TTI::TCK_RecipThroughput, OpInfo, I);
5955 }
5956 return getWideningCost(I, VF);
5957}
5958
5959InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
5961
5962 // There is no mechanism yet to create a scalable scalarization loop,
5963 // so this is currently Invalid.
5964 if (VF.isScalable())
5966
5967 if (VF.isScalar())
5968 return 0;
5969
5971 Type *RetTy = ToVectorTy(I->getType(), VF);
5972 if (!RetTy->isVoidTy() &&
5973 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5975 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5976 /*Insert*/ true,
5977 /*Extract*/ false, CostKind);
5978
5979 // Some targets keep addresses scalar.
5980 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5981 return Cost;
5982
5983 // Some targets support efficient element stores.
5984 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5985 return Cost;
5986
5987 // Collect operands to consider.
5988 CallInst *CI = dyn_cast<CallInst>(I);
5989 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5990
5991 // Skip operands that do not require extraction/scalarization and do not incur
5992 // any overhead.
5994 for (auto *V : filterExtractingOperands(Ops, VF))
5995 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
5997 filterExtractingOperands(Ops, VF), Tys, CostKind);
5998}
5999
6001 if (VF.isScalar())
6002 return;
6003 NumPredStores = 0;
6004 for (BasicBlock *BB : TheLoop->blocks()) {
6005 // For each instruction in the old loop.
6006 for (Instruction &I : *BB) {
6008 if (!Ptr)
6009 continue;
6010
6011 // TODO: We should generate better code and update the cost model for
6012 // predicated uniform stores. Today they are treated as any other
6013 // predicated store (see added test cases in
6014 // invariant-store-vectorization.ll).
6015 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6016 NumPredStores++;
6017
6018 if (Legal->isUniformMemOp(I, VF)) {
6019 auto isLegalToScalarize = [&]() {
6020 if (!VF.isScalable())
6021 // Scalarization of fixed length vectors "just works".
6022 return true;
6023
6024 // We have dedicated lowering for unpredicated uniform loads and
6025 // stores. Note that even with tail folding we know that at least
6026 // one lane is active (i.e. generalized predication is not possible
6027 // here), and the logic below depends on this fact.
6028 if (!foldTailByMasking())
6029 return true;
6030
6031 // For scalable vectors, a uniform memop load is always
6032 // uniform-by-parts and we know how to scalarize that.
6033 if (isa<LoadInst>(I))
6034 return true;
6035
6036          // A uniform store isn't necessarily uniform-by-parts
6037 // and we can't assume scalarization.
6038 auto &SI = cast<StoreInst>(I);
6039 return TheLoop->isLoopInvariant(SI.getValueOperand());
6040 };
6041
6042 const InstructionCost GatherScatterCost =
6044 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6045
6046 // Load: Scalar load + broadcast
6047 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6048 // FIXME: This cost is a significant under-estimate for tail folded
6049 // memory ops.
6050 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6051 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6052
6053        // Choose the better solution for the current VF. Note that Invalid
6054        // costs compare as maximally large. If both are invalid, we get a
6055        // scalable invalid cost, which signals a failure and a vectorization abort.
6056 if (GatherScatterCost < ScalarizationCost)
6057 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6058 else
6059 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6060 continue;
6061 }
6062
6063 // We assume that widening is the best solution when possible.
6064 if (memoryInstructionCanBeWidened(&I, VF)) {
6065 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6066 int ConsecutiveStride = Legal->isConsecutivePtr(
6068 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6069 "Expected consecutive stride.");
6070 InstWidening Decision =
6071 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6072 setWideningDecision(&I, VF, Decision, Cost);
6073 continue;
6074 }
6075
6076 // Choose between Interleaving, Gather/Scatter or Scalarization.
6078 unsigned NumAccesses = 1;
6079 if (isAccessInterleaved(&I)) {
6080 auto Group = getInterleavedAccessGroup(&I);
6081 assert(Group && "Fail to get an interleaved access group.");
6082
6083 // Make one decision for the whole group.
6084 if (getWideningDecision(&I, VF) != CM_Unknown)
6085 continue;
6086
6087 NumAccesses = Group->getNumMembers();
6089 InterleaveCost = getInterleaveGroupCost(&I, VF);
6090 }
6091
6092 InstructionCost GatherScatterCost =
6094 ? getGatherScatterCost(&I, VF) * NumAccesses
6096
6097 InstructionCost ScalarizationCost =
6098 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6099
6100        // Choose the better solution for the current VF,
6101        // record this decision and use it during vectorization.
6103 InstWidening Decision;
6104 if (InterleaveCost <= GatherScatterCost &&
6105 InterleaveCost < ScalarizationCost) {
6106 Decision = CM_Interleave;
6107 Cost = InterleaveCost;
6108 } else if (GatherScatterCost < ScalarizationCost) {
6109 Decision = CM_GatherScatter;
6110 Cost = GatherScatterCost;
6111 } else {
6112 Decision = CM_Scalarize;
6113 Cost = ScalarizationCost;
6114 }
6115      // If the instruction belongs to an interleave group, the whole group
6116      // receives the same decision. The whole group receives the cost, but
6117      // the cost will actually be assigned to one instruction.
6118 if (auto Group = getInterleavedAccessGroup(&I))
6119 setWideningDecision(Group, VF, Decision, Cost);
6120 else
6121 setWideningDecision(&I, VF, Decision, Cost);
6122 }
6123 }
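    // Illustrative example (hypothetical, not from the surrounding code): for
    // the pair a[2*i] and a[2*i+1] forming an interleave group of factor 2,
    // the choice above is between one wide load plus de-interleaving shuffles
    // (CM_Interleave), two gathers (CM_GatherScatter), or 2 * VF scalar loads
    // (CM_Scalarize); the whole group gets the single cheapest decision.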
6124
6125 // Make sure that any load of address and any other address computation
6126 // remains scalar unless there is gather/scatter support. This avoids
6127 // inevitable extracts into address registers, and also has the benefit of
6128 // activating LSR more, since that pass can't optimize vectorized
6129 // addresses.
6130  if (TTI.prefersVectorizedAddressing())
6131    return;
6132
6133 // Start with all scalar pointer uses.
6134  SmallPtrSet<Instruction *, 8> AddrDefs;
6135  for (BasicBlock *BB : TheLoop->blocks())
6136 for (Instruction &I : *BB) {
6137 Instruction *PtrDef =
6138 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6139 if (PtrDef && TheLoop->contains(PtrDef) &&
6140          getWideningDecision(&I, VF) != CM_GatherScatter)
6141        AddrDefs.insert(PtrDef);
6142 }
6143
6144 // Add all instructions used to generate the addresses.
6145  SmallVector<Instruction *, 4> Worklist;
6146  append_range(Worklist, AddrDefs);
6147 while (!Worklist.empty()) {
6148 Instruction *I = Worklist.pop_back_val();
6149 for (auto &Op : I->operands())
6150 if (auto *InstOp = dyn_cast<Instruction>(Op))
6151 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6152 AddrDefs.insert(InstOp).second)
6153 Worklist.push_back(InstOp);
6154 }
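  // Illustrative example (hypothetical, not from the surrounding code): in a
  // pointer chase such as
  //   %p = load %q
  //   %v = load %p
  // the inner load producing the address %p, together with any same-block,
  // non-PHI instructions feeding it, ends up in AddrDefs so it can be kept
  // scalar below instead of forcing extracts into address registers.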
6155
6156 for (auto *I : AddrDefs) {
6157 if (isa<LoadInst>(I)) {
6158      // Setting the desired widening decision should ideally be handled by
6159      // the cost functions, but since this involves finding out whether the
6160      // loaded register is involved in an address computation, it is instead
6161      // changed here when we know this is the case.
6162 InstWidening Decision = getWideningDecision(I, VF);
6163 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6164 // Scalarize a widened load of address.
6165        setWideningDecision(
6166            I, VF, CM_Scalarize,
6167 (VF.getKnownMinValue() *
6168 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6169 else if (auto Group = getInterleavedAccessGroup(I)) {
6170 // Scalarize an interleave group of address loads.
6171 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6172 if (Instruction *Member = Group->getMember(I))
6173          setWideningDecision(
6174              Member, VF, CM_Scalarize,
6175 (VF.getKnownMinValue() *
6176 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6177 }
6178 }
6179 } else
6180      // Make sure I gets scalarized and receives a cost estimate without
6181      // scalarization overhead.
6182 ForcedScalars[VF].insert(I);
6183 }
6184}
6185
6186 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6187   assert(!VF.isScalar() &&
6188 "Trying to set a vectorization decision for a scalar VF");
6189
6190 for (BasicBlock *BB : TheLoop->blocks()) {
6191 // For each instruction in the old loop.
6192 for (Instruction &I : *BB) {
6193 CallInst *CI = dyn_cast<CallInst>(&I);
6194
6195 if (!CI)
6196 continue;
6197
6198      InstructionCost ScalarCost = InstructionCost::getInvalid();
6199      InstructionCost VectorCost = InstructionCost::getInvalid();
6200      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6201      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6202
6203 Function *ScalarFunc = CI->getCalledFunction();
6204 Type *ScalarRetTy = CI->getType();
6205 SmallVector<Type *, 4> Tys, ScalarTys;
6206 bool MaskRequired = Legal->isMaskRequired(CI);
6207 for (auto &ArgOp : CI->args())
6208 ScalarTys.push_back(ArgOp->getType());
6209
6210 // Compute corresponding vector type for return value and arguments.
6211 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6212 for (Type *ScalarTy : ScalarTys)
6213 Tys.push_back(ToVectorTy(ScalarTy, VF));
6214
6215 // An in-loop reduction using an fmuladd intrinsic is a special case;
6216 // we don't want the normal cost for that intrinsic.
6218 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6221 std::nullopt, *RedCost);
6222 continue;
6223 }
6224
6225 // Estimate cost of scalarized vector call. The source operands are
6226 // assumed to be vectors, so we need to extract individual elements from
6227 // there, execute VF scalar calls, and then gather the result into the
6228 // vector return value.
6229 InstructionCost ScalarCallCost =
6230 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6231
6232 // Compute costs of unpacking argument values for the scalar calls and
6233 // packing the return values to a vector.
6234 InstructionCost ScalarizationCost =
6235 getScalarizationOverhead(CI, VF, CostKind);
6236
6237 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6238
6239 // Find the cost of vectorizing the call, if we can find a suitable
6240 // vector variant of the function.
6241 bool UsesMask = false;
6242 VFInfo FuncInfo;
6243 Function *VecFunc = nullptr;
6244 // Search through any available variants for one we can use at this VF.
6245 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6246 // Must match requested VF.
6247 if (Info.Shape.VF != VF)
6248 continue;
6249
6250 // Must take a mask argument if one is required
6251 if (MaskRequired && !Info.isMasked())
6252 continue;
6253
6254 // Check that all parameter kinds are supported
6255 bool ParamsOk = true;
6256 for (VFParameter Param : Info.Shape.Parameters) {
6257 switch (Param.ParamKind) {
6258          case VFParamKind::Vector:
6259            break;
6260          case VFParamKind::OMP_Uniform: {
6261            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6262 // Make sure the scalar parameter in the loop is invariant.
6263 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6264 TheLoop))
6265 ParamsOk = false;
6266 break;
6267 }
6268          case VFParamKind::OMP_Linear: {
6269            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6270 // Find the stride for the scalar parameter in this loop and see if
6271 // it matches the stride for the variant.
6272 // TODO: do we need to figure out the cost of an extract to get the
6273 // first lane? Or do we hope that it will be folded away?
6274 ScalarEvolution *SE = PSE.getSE();
6275 const auto *SAR =
6276 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6277
6278 if (!SAR || SAR->getLoop() != TheLoop) {
6279 ParamsOk = false;
6280 break;
6281 }
6282
6283 const SCEVConstant *Step =
6284 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6285
6286 if (!Step ||
6287 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6288 ParamsOk = false;
6289
6290 break;
6291 }
6292          case VFParamKind::GlobalPredicate:
6293            UsesMask = true;
6294 break;
6295 default:
6296 ParamsOk = false;
6297 break;
6298 }
6299 }
6300
6301 if (!ParamsOk)
6302 continue;
6303
6304 // Found a suitable candidate, stop here.
6305 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6306 FuncInfo = Info;
6307 break;
6308 }
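      // Illustrative note (hypothetical example, not from the surrounding
      // code): vector variants typically originate from declarations like
      // '#pragma omp declare simd' and are exposed to the vectorizer through
      // the "vector-function-abi-variant" call-site attribute; the loop above
      // picks the first mapping whose shape matches this VF and whose
      // parameter kinds (and mask requirement) can be honoured.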
6309
6310 // Add in the cost of synthesizing a mask if one wasn't required.
6311 InstructionCost MaskCost = 0;
6312 if (VecFunc && UsesMask && !MaskRequired)
6313 MaskCost = TTI.getShuffleCost(
6316 VecFunc->getFunctionType()->getContext()),
6317 VF));
6318
6319 if (TLI && VecFunc && !CI->isNoBuiltin())
6320 VectorCost =
6321 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6322
6323 // Find the cost of an intrinsic; some targets may have instructions that
6324 // perform the operation without needing an actual call.
6325      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6326      if (IID != Intrinsic::not_intrinsic)
6327 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6328
6329 InstructionCost Cost = ScalarCost;
6330 InstWidening Decision = CM_Scalarize;
6331
6332 if (VectorCost <= Cost) {
6333 Cost = VectorCost;
6334 Decision = CM_VectorCall;
6335 }
6336
6337 if (IntrinsicCost <= Cost) {
6338 Cost = IntrinsicCost;
6339 Decision = CM_IntrinsicCall;
6340 }
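      // Illustrative example (hypothetical, not from the surrounding code):
      // for a call to llvm.sqrt.f32 at VF=4 the comparison above picks the
      // cheapest of VF scalar calls plus packing (CM_Scalarize), a vector
      // library variant if one was found (CM_VectorCall), or the
      // llvm.sqrt.v4f32 intrinsic (CM_IntrinsicCall).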
6341
6342 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6344 }
6345 }
6346}
6347
6348 InstructionCost
6349 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6350                                                ElementCount VF) {
6351 // If we know that this instruction will remain uniform, check the cost of
6352 // the scalar version.
6353  if (isUniformAfterVectorization(I, VF))
6354    VF = ElementCount::getFixed(1);
6355
6356 if (VF.isVector() && isProfitableToScalarize(I, VF))
6357 return InstsToScalarize[VF][I];
6358
6359 // Forced scalars do not have any scalarization overhead.
6360 auto ForcedScalar = ForcedScalars.find(VF);
6361 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6362 auto InstSet = ForcedScalar->second;
6363 if (InstSet.count(I))
6364        return getInstructionCost(I, ElementCount::getFixed(1)) *
6365               VF.getKnownMinValue();
6366 }
6367
6368 Type *RetTy = I->getType();
6369  if (canTruncateToMinimalBitwidth(I, VF))
6370    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6371 auto SE = PSE.getSE();
6372  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6373
6374 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6375 ElementCount VF) -> bool {
6376 if (VF.isScalar())
6377 return true;
6378
6379 auto Scalarized = InstsToScalarize.find(VF);
6380 assert(Scalarized != InstsToScalarize.end() &&
6381 "VF not yet analyzed for scalarization profitability");
6382 return !Scalarized->second.count(I) &&
6383 llvm::all_of(I->users(), [&](User *U) {
6384 auto *UI = cast<Instruction>(U);
6385 return !Scalarized->second.count(UI);
6386 });
6387 };
6388 (void) hasSingleCopyAfterVectorization;
6389
6390 Type *VectorTy;
6391 if (isScalarAfterVectorization(I, VF)) {
6392 // With the exception of GEPs and PHIs, after scalarization there should
6393 // only be one copy of the instruction generated in the loop. This is
6394 // because the VF is either 1, or any instructions that need scalarizing
6395 // have already been dealt with by the time we get here. As a result,
6396  // we don't have to multiply the instruction cost by VF.
6397 assert(I->getOpcode() == Instruction::GetElementPtr ||
6398 I->getOpcode() == Instruction::PHI ||
6399 (I->getOpcode() == Instruction::BitCast &&
6400 I->getType()->isPointerTy()) ||
6401 hasSingleCopyAfterVectorization(I, VF));
6402 VectorTy = RetTy;
6403 } else
6404 VectorTy = ToVectorTy(RetTy, VF);
6405
6406 if (VF.isVector() && VectorTy->isVectorTy() &&
6407 !TTI.getNumberOfParts(VectorTy))
6408    return InstructionCost::getInvalid();
6409
6410 // TODO: We need to estimate the cost of intrinsic calls.
6411 switch (I->getOpcode()) {
6412 case Instruction::GetElementPtr:
6413 // We mark this instruction as zero-cost because the cost of GEPs in
6414 // vectorized code depends on whether the corresponding memory instruction
6415 // is scalarized or not. Therefore, we handle GEPs with the memory
6416 // instruction cost.
6417 return 0;
6418 case Instruction::Br: {
6419 // In cases of scalarized and predicated instructions, there will be VF
6420 // predicated blocks in the vectorized loop. Each branch around these
6421    // blocks also requires an extract of its vector compare i1 element.
6422 // Note that the conditional branch from the loop latch will be replaced by
6423 // a single branch controlling the loop, so there is no extra overhead from
6424 // scalarization.
6425 bool ScalarPredicatedBB = false;
6426 BranchInst *BI = cast<BranchInst>(I);
6427 if (VF.isVector() && BI->isConditional() &&
6428 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6429 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6430 BI->getParent() != TheLoop->getLoopLatch())
6431 ScalarPredicatedBB = true;
6432
6433 if (ScalarPredicatedBB) {
6434 // Not possible to scalarize scalable vector with predicated instructions.
6435 if (VF.isScalable())
6436        return InstructionCost::getInvalid();
6437      // Return cost for branches around scalarized and predicated blocks.
6438 auto *Vec_i1Ty =
6439 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6440 return (
6442 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6443 /*Insert*/ false, /*Extract*/ true, CostKind) +
6444 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6445 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6446 // The back-edge branch will remain, as will all scalar branches.
6447 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6448 else
6449 // This branch will be eliminated by if-conversion.
6450 return 0;
6451 // Note: We currently assume zero cost for an unconditional branch inside
6452 // a predicated block since it will become a fall-through, although we
6453 // may decide in the future to call TTI for all branches.
6454 }
6455 case Instruction::Switch: {
6456 if (VF.isScalar())
6457 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6458 auto *Switch = cast<SwitchInst>(I);
6459 return Switch->getNumCases() *
6461 Instruction::ICmp,
6462 ToVectorTy(Switch->getCondition()->getType(), VF),
6463 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
6465 }
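    // Illustrative example (hypothetical, not from the surrounding code): a
    // switch with three non-default cases at VF=4 is costed above as three
    // vector integer compares of the widened condition against the case
    // values.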
6466 case Instruction::PHI: {
6467 auto *Phi = cast<PHINode>(I);
6468
6469 // First-order recurrences are replaced by vector shuffles inside the loop.
6470 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6471 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6472 // penultimate value of the recurrence.
6473 // TODO: Consider vscale_range info.
6474 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6477 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6479 cast<VectorType>(VectorTy), Mask, CostKind,
6480 VF.getKnownMinValue() - 1);
6481 }
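      // Illustrative example (hypothetical, not from the surrounding code):
      // for VF=4 the mask built above is <3, 4, 5, 6>, i.e. a splice of the
      // previous and current vector values that shifts the recurrence by one
      // lane.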
6482
6483 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6484 // converted into select instructions. We require N - 1 selects per phi
6485 // node, where N is the number of incoming values.
6486 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6487 return (Phi->getNumIncomingValues() - 1) *
6489 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6490 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6492
6493 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6494 }
6495 case Instruction::UDiv:
6496 case Instruction::SDiv:
6497 case Instruction::URem:
6498 case Instruction::SRem:
6499 if (VF.isVector() && isPredicatedInst(I)) {
6500 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6501 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6502 ScalarCost : SafeDivisorCost;
6503 }
6504 // We've proven all lanes safe to speculate, fall through.
6505 [[fallthrough]];
6506 case Instruction::Add:
6507 case Instruction::FAdd:
6508 case Instruction::Sub:
6509 case Instruction::FSub:
6510 case Instruction::Mul:
6511 case Instruction::FMul:
6512 case Instruction::FDiv:
6513 case Instruction::FRem:
6514 case Instruction::Shl:
6515 case Instruction::LShr:
6516 case Instruction::AShr:
6517 case Instruction::And:
6518 case Instruction::Or:
6519 case Instruction::Xor: {
6520 // If we're speculating on the stride being 1, the multiplication may
6521 // fold away. We can generalize this for all operations using the notion
6522 // of neutral elements. (TODO)
6523 if (I->getOpcode() == Instruction::Mul &&
6524 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6525 PSE.getSCEV(I->getOperand(1))->isOne()))
6526 return 0;
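    // Illustrative example (hypothetical, not from the surrounding code):
    // when a stride has been versioned to be 1, an index computation such as
    // 'i * Stride' has an operand whose SCEV is the constant 1, so the
    // multiply is expected to fold away and is given zero cost here.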
6527
6528 // Detect reduction patterns
6529 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6530 return *RedCost;
6531
6532 // Certain instructions can be cheaper to vectorize if they have a constant
6533 // second vector operand. One example of this are shifts on x86.
6534 Value *Op2 = I->getOperand(1);
6535 auto Op2Info = TTI.getOperandInfo(Op2);
6536 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6537 Legal->isInvariant(Op2))
6539
6540 SmallVector<const Value *, 4> Operands(I->operand_values());
6542 I->getOpcode(), VectorTy, CostKind,
6543 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6544 Op2Info, Operands, I, TLI);
6545 }
6546 case Instruction::FNeg: {
6548 I->getOpcode(), VectorTy, CostKind,
6549 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6550 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6551 I->getOperand(0), I);
6552 }
6553 case Instruction::Select: {
6554 SelectInst *SI = cast<SelectInst>(I);
6555 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6556 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6557
6558 const Value *Op0, *Op1;
6559 using namespace llvm::PatternMatch;
6560 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6561 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6562 // select x, y, false --> x & y
6563 // select x, true, y --> x | y
6564 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6565 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6566 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6567 Op1->getType()->getScalarSizeInBits() == 1);
6568
6571 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6572 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6573 }
6574
6575 Type *CondTy = SI->getCondition()->getType();
6576 if (!ScalarCond)
6577 CondTy = VectorType::get(CondTy, VF);
6578
6580 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6581 Pred = Cmp->getPredicate();
6582 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6583 CostKind, I);
6584 }
6585 case Instruction::ICmp:
6586 case Instruction::FCmp: {
6587 Type *ValTy = I->getOperand(0)->getType();
6588 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6589 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6590 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6591 VectorTy = ToVectorTy(ValTy, VF);
6592 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6593 cast<CmpInst>(I)->getPredicate(), CostKind,
6594 I);
6595 }
6596 case Instruction::Store:
6597 case Instruction::Load: {
6598 ElementCount Width = VF;
6599 if (Width.isVector()) {
6600 InstWidening Decision = getWideningDecision(I, Width);
6601 assert(Decision != CM_Unknown &&
6602 "CM decision should be taken at this point");
6605 if (Decision == CM_Scalarize)
6606 Width = ElementCount::getFixed(1);
6607 }
6608 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
6609 return getMemoryInstructionCost(I, VF);
6610 }
6611 case Instruction::BitCast:
6612 if (I->getType()->isPointerTy())
6613 return 0;
6614 [[fallthrough]];
6615 case Instruction::ZExt:
6616 case Instruction::SExt:
6617 case Instruction::FPToUI:
6618 case Instruction::FPToSI:
6619 case Instruction::FPExt:
6620 case Instruction::PtrToInt:
6621 case Instruction::IntToPtr:
6622 case Instruction::SIToFP:
6623 case Instruction::UIToFP:
6624 case Instruction::Trunc:
6625 case Instruction::FPTrunc: {
6626 // Computes the CastContextHint from a Load/Store instruction.
6627 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6628 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6629 "Expected a load or a store!");
6630
6631 if (VF.isScalar() || !TheLoop->contains(I))
6632        return TTI::CastContextHint::None;
6633
6634 switch (getWideningDecision(I, VF)) {
6646 llvm_unreachable("Instr did not go through cost modelling?");
6649 llvm_unreachable_internal("Instr has invalid widening decision");
6650 }
6651
6652 llvm_unreachable("Unhandled case!");
6653 };
6654
6655 unsigned Opcode = I->getOpcode();
6656    TTI::CastContextHint CCH = TTI::CastContextHint::None;
6657    // For Trunc, the context is the only user, which must be a StoreInst.
6658 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6659 if (I->hasOneUse())
6660 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6661 CCH = ComputeCCH(Store);
6662 }
6663 // For Z/Sext, the context is the operand, which must be a LoadInst.
6664 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6665 Opcode == Instruction::FPExt) {
6666 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6667 CCH = ComputeCCH(Load);
6668 }
6669
6670 // We optimize the truncation of induction variables having constant
6671 // integer steps. The cost of these truncations is the same as the scalar
6672 // operation.
6673 if (isOptimizableIVTruncate(I, VF)) {
6674 auto *Trunc = cast<TruncInst>(I);
6675 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6676 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6677 }
6678
6679 // Detect reduction patterns
6680 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6681 return *RedCost;
6682
6683 Type *SrcScalarTy = I->getOperand(0)->getType();
6684 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6685 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6686 SrcScalarTy =
6687 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6688 Type *SrcVecTy =
6689 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6690
6691    if (canTruncateToMinimalBitwidth(I, VF)) {
6692      // If the result type is <= the source type, there will be no extend
6693 // after truncating the users to the minimal required bitwidth.
6694 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6695 (I->getOpcode() == Instruction::ZExt ||
6696 I->getOpcode() == Instruction::SExt))
6697 return 0;
6698 }
6699
6700 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6701 }
6702 case Instruction::Call:
6703 return getVectorCallCost(cast<CallInst>(I), VF);
6704 case Instruction::ExtractValue:
6706 case Instruction::Alloca:
6707 // We cannot easily widen alloca to a scalable alloca, as
6708 // the result would need to be a vector of pointers.
6709 if (VF.isScalable())
6710      return InstructionCost::getInvalid();
6711    [[fallthrough]];
6712 default:
6713 // This opcode is unknown. Assume that it is the same as 'mul'.
6714 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6715 } // end of switch.
6716}
6717
6718 void LoopVectorizationCostModel::collectValuesToIgnore() {
6719   // Ignore ephemeral values.
6720   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6721
6722 SmallVector<Value *, 4> DeadInterleavePointerOps;
6723   SmallVector<Value *, 4> DeadOps;
6724
6725 // If a scalar epilogue is required, users outside the loop won't use
6726 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6727 // that is the case.
6728 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6729 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6730 return RequiresScalarEpilogue &&
6731 !TheLoop->contains(cast<Instruction>(U)->getParent());
6732 };
6733 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6734 for (BasicBlock *BB : TheLoop->blocks())
6735 for (Instruction &I : *BB) {
6736 // Find all stores to invariant variables. Since they are going to sink
6737      // outside the loop, we do not need to calculate their cost.
6738 StoreInst *SI;
6739 if ((SI = dyn_cast<StoreInst>(&I)) &&
6740 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6741 ValuesToIgnore.insert(&I);
6742 auto I = DeadInvariantStoreOps.insert({SI->getPointerOperand(), {}});
6743 I.first->second.push_back(SI->getValueOperand());
6744 }
6745
6746 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6747 continue;
6748
6749 // Add instructions that would be trivially dead and are only used by
6750      // values already ignored to DeadOps, to seed the worklist.
6751      if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6752          all_of(I.users(), [this, IsLiveOutDead](User *U) {
6753 return VecValuesToIgnore.contains(U) ||
6754 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6755 }))
6756 DeadOps.push_back(&I);
6757
6758 // For interleave groups, we only create a pointer for the start of the
6759 // interleave group. Queue up addresses of group members except the insert
6760 // position for further processing.
6761 if (isAccessInterleaved(&I)) {
6762 auto *Group = getInterleavedAccessGroup(&I);
6763 if (Group->getInsertPos() == &I)
6764 continue;
6765 Value *PointerOp = getLoadStorePointerOperand(&I);
6766 DeadInterleavePointerOps.push_back(PointerOp);
6767 }
6768 }
6769
6770 // Mark ops feeding interleave group members as free, if they are only used
6771 // by other dead computations.
6772 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6773 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6774 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6775 Instruction *UI = cast<Instruction>(U);
6776 return !VecValuesToIgnore.contains(U) &&
6777 (!isAccessInterleaved(UI) ||
6778 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6779 }))
6780 continue;
6781 VecValuesToIgnore.insert(Op);
6782 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6783 }
6784
6785 for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6786 for (Value *Op : ArrayRef(Ops).drop_back())
6787 DeadOps.push_back(Op);
6788 }
6789 // Mark ops that would be trivially dead and are only used by ignored
6790 // instructions as free.
6791 BasicBlock *Header = TheLoop->getHeader();
6792 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6793 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6794 // Skip any op that shouldn't be considered dead.
6795 if (!Op || !TheLoop->contains(Op) ||
6796 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6798 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6799 return !VecValuesToIgnore.contains(U) &&
6800 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6801 }))
6802 continue;
6803
6804 if (!TheLoop->contains(Op->getParent()))
6805 continue;
6806
6807 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6808 // which applies for both scalar and vector versions. Otherwise it is only
6809 // dead in vector versions, so only add it to VecValuesToIgnore.
6810 if (all_of(Op->users(),
6811 [this](User *U) { return ValuesToIgnore.contains(U); }))
6812 ValuesToIgnore.insert(Op);
6813
6814 VecValuesToIgnore.insert(Op);
6815 DeadOps.append(Op->op_begin(), Op->op_end());
6816 }
6817
6818 // Ignore type-promoting instructions we identified during reduction
6819 // detection.
6820 for (const auto &Reduction : Legal->getReductionVars()) {
6821 const RecurrenceDescriptor &RedDes = Reduction.second;
6822 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6823 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6824 }
6825 // Ignore type-casting instructions we identified during induction
6826 // detection.
6827 for (const auto &Induction : Legal->getInductionVars()) {
6828 const InductionDescriptor &IndDes = Induction.second;
6829 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6830 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6831 }
6832}
6833
6834 void LoopVectorizationCostModel::collectInLoopReductions() {
6835   for (const auto &Reduction : Legal->getReductionVars()) {
6836 PHINode *Phi = Reduction.first;
6837 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6838
6839 // We don't collect reductions that are type promoted (yet).
6840 if (RdxDesc.getRecurrenceType() != Phi->getType())
6841 continue;
6842
6843 // If the target would prefer this reduction to happen "in-loop", then we
6844 // want to record it as such.
6845 unsigned Opcode = RdxDesc.getOpcode();
6846 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6847 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6849 continue;
6850
6851 // Check that we can correctly put the reductions into the loop, by
6852 // finding the chain of operations that leads from the phi to the loop
6853 // exit value.
6854 SmallVector<Instruction *, 4> ReductionOperations =
6855 RdxDesc.getReductionOpChain(Phi, TheLoop);
6856 bool InLoop = !ReductionOperations.empty();
6857
6858 if (InLoop) {
6859 InLoopReductions.insert(Phi);
6860 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6861 Instruction *LastChain = Phi;
6862 for (auto *I : ReductionOperations) {
6863 InLoopReductionImmediateChains[I] = LastChain;
6864 LastChain = I;
6865 }
6866 }
6867 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6868 << " reduction for phi: " << *Phi << "\n");
6869 }
6870}
6871
6873 DebugLoc DL, const Twine &Name) {
6875 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
6876 return tryInsertInstruction(
6877 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
6878}
6879
6880// This function will select a scalable VF if the target supports scalable
6881// vectors and a fixed one otherwise.
6882// TODO: we could return a pair of values that specify the max VF and
6883// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6884 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6885// doesn't have a cost model that can choose which plan to execute if
6886// more than one is generated.
6889 unsigned WidestType;
6890 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6891
6896
6898 unsigned N = RegSize.getKnownMinValue() / WidestType;
6899 return ElementCount::get(N, RegSize.isScalable());
6900}
6901
6904 ElementCount VF = UserVF;
6905 // Outer loop handling: They may require CFG and instruction level
6906 // transformations before even evaluating whether vectorization is profitable.
6907 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6908 // the vectorization pipeline.
6909 if (!OrigLoop->isInnermost()) {
6910 // If the user doesn't provide a vectorization factor, determine a
6911 // reasonable one.
6912 if (UserVF.isZero()) {
6913 VF = determineVPlanVF(TTI, CM);
6914 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6915
6916 // Make sure we have a VF > 1 for stress testing.
6917 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6918 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6919 << "overriding computed VF.\n");
6920 VF = ElementCount::getFixed(4);
6921 }
6922 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6924 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6925 << "not supported by the target.\n");
6927 "Scalable vectorization requested but not supported by the target",
6928 "the scalable user-specified vectorization width for outer-loop "
6929 "vectorization cannot be used because the target does not support "
6930 "scalable vectors.",
6931 "ScalableVFUnfeasible", ORE, OrigLoop);
6932       return VectorizationFactor::Disabled();
6933     }
6934 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6936 "VF needs to be a power of two");
6937 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6938 << "VF " << VF << " to build VPlans.\n");
6939 buildVPlans(VF, VF);
6940
6941 // For VPlan build stress testing, we bail out after VPlan construction.
6944
6945 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6946 }
6947
6948 LLVM_DEBUG(
6949 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6950 "VPlan-native path.\n");
6951   return VectorizationFactor::Disabled();
6952}
6953
6954std::optional<VectorizationFactor>
6955 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6956   assert(OrigLoop->isInnermost() && "Inner loop expected.");
6959
6960 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6961 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
6962 return std::nullopt;
6963
6964 // Invalidate interleave groups if all blocks of loop will be predicated.
6965 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6967 LLVM_DEBUG(
6968 dbgs()
6969 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6970 "which requires masked-interleaved support.\n");
6972 // Invalidating interleave groups also requires invalidating all decisions
6973 // based on them, which includes widening decisions and uniform and scalar
6974 // values.
6976 }
6977
6978 if (CM.foldTailByMasking())
6980
6981 ElementCount MaxUserVF =
6982 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6983 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
6984 if (!UserVF.isZero() && UserVFIsLegal) {
6986 "VF needs to be a power of two");
6987 // Collect the instructions (and their associated costs) that will be more
6988 // profitable to scalarize.
6990 if (CM.selectUserVectorizationFactor(UserVF)) {
6991 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6992 buildVPlansWithVPRecipes(UserVF, UserVF);
6993 if (!hasPlanWithVF(UserVF)) {
6994 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
6995 << ".\n");
6996 return std::nullopt;
6997 }
6998
7000 return {{UserVF, 0, 0}};
7001 } else
7002 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7003 "InvalidCost", ORE, OrigLoop);
7004 }
7005
7006 // Collect the Vectorization Factor Candidates.
7007 SmallVector<ElementCount> VFCandidates;
7008 for (auto VF = ElementCount::getFixed(1);
7009 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7010 VFCandidates.push_back(VF);
7011 for (auto VF = ElementCount::getScalable(1);
7012 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7013 VFCandidates.push_back(VF);
7014
7016 for (const auto &VF : VFCandidates) {
7017 // Collect Uniform and Scalar instructions after vectorization with VF.
7019
7020 // Collect the instructions (and their associated costs) that will be more
7021 // profitable to scalarize.
7022 if (VF.isVector())
7024 }
7025
7026 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7027 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7028
7030 if (VPlans.empty())
7031 return std::nullopt;
7032 if (all_of(VPlans,
7033 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
7035
7036 // Select the optimal vectorization factor according to the legacy cost-model.
7037 // This is now only used to verify the decisions by the new VPlan-based
7038 // cost-model and will be retired once the VPlan-based cost-model is
7039 // stabilized.
7040 VectorizationFactor VF = selectVectorizationFactor();
7041 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7042 if (!hasPlanWithVF(VF.Width)) {
7043 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7044 << ".\n");
7045 return std::nullopt;
7046 }
7047 return VF;
7048}
7049
7051 ElementCount VF) const {
7052 return CM.getInstructionCost(UI, VF);
7053}
7054
7055bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7056 return CM.ValuesToIgnore.contains(UI) ||
7057 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7058 SkipCostComputation.contains(UI);
7059}
7060
7061InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7062 ElementCount VF) const {
7064 LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
7065 VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
7066
7067 // Cost modeling for inductions is inaccurate in the legacy cost model
7068 // compared to the recipes that are generated. To match here initially during
7069 // VPlan cost model bring up directly use the induction costs from the legacy
7070 // cost model. Note that we do this as pre-processing; the VPlan may not have
7071 // any recipes associated with the original induction increment instruction
7072 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7073 // the cost of induction phis and increments (both that are represented by
7074 // recipes and those that are not), to avoid distinguishing between them here,
7075 // and skip all recipes that represent induction phis and increments (the
7076 // former case) later on, if they exist, to avoid counting them twice.
7077 // Similarly we pre-compute the cost of any optimized truncates.
7078 // TODO: Switch to more accurate costing based on VPlan.
7079 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7080 Instruction *IVInc = cast<Instruction>(
7081 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7082 SmallVector<Instruction *> IVInsts = {IVInc};
7083 for (unsigned I = 0; I != IVInsts.size(); I++) {
7084 for (Value *Op : IVInsts[I]->operands()) {
7085 auto *OpI = dyn_cast<Instruction>(Op);
7086 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7087 continue;
7088 IVInsts.push_back(OpI);
7089 }
7090 }
7091 IVInsts.push_back(IV);
7092 for (User *U : IV->users()) {
7093 auto *CI = cast<Instruction>(U);
7094 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7095 continue;
7096 IVInsts.push_back(CI);
7097 }
7098 for (Instruction *IVInst : IVInsts) {
7099 if (!CostCtx.SkipCostComputation.insert(IVInst).second)
7100 continue;
7101 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7102 LLVM_DEBUG({
7103 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7104 << ": induction instruction " << *IVInst << "\n";
7105 });
7106 Cost += InductionCost;
7107 }
7108 }
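  // Illustrative example (hypothetical, not from the surrounding code): for
  // an induction
  //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1
  // both instructions are pre-costed with the legacy model here and added to
  // SkipCostComputation, so their recipes are not costed a second time when
  // Plan.cost() is invoked below.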
7109
7110 /// Compute the cost of all exiting conditions of the loop using the legacy
7111 /// cost model. This is to match the legacy behavior, which adds the cost of
7112 /// all exit conditions. Note that this over-estimates the cost, as there will
7113 /// be a single condition to control the vector loop.
7115 CM.TheLoop->getExitingBlocks(Exiting);
7116 SetVector<Instruction *> ExitInstrs;
7117 // Collect all exit conditions.
7118 for (BasicBlock *EB : Exiting) {
7119 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7120 if (!Term)
7121 continue;
7122 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7123 ExitInstrs.insert(CondI);
7124 }
7125 }
7126 // Compute the cost of all instructions only feeding the exit conditions.
7127 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7128 Instruction *CondI = ExitInstrs[I];
7129 if (!OrigLoop->contains(CondI) ||
7130 !CostCtx.SkipCostComputation.insert(CondI).second)
7131 continue;
7132 Cost += CostCtx.getLegacyCost(CondI, VF);
7133 for (Value *Op : CondI->operands()) {
7134 auto *OpI = dyn_cast<Instruction>(Op);
7135 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7136 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7137 !ExitInstrs.contains(cast<Instruction>(U));
7138 }))
7139 continue;
7140 ExitInstrs.insert(OpI);
7141 }
7142 }
7143
7144 // The legacy cost model has special logic to compute the cost of in-loop
7145 // reductions, which may be smaller than the sum of all instructions involved
7146 // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7147 // which the legacy cost model uses to assign cost. Pre-compute their costs
7148 // for now.
7149 // TODO: Switch to costing based on VPlan once the logic has been ported.
7150 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7151 if (!CM.isInLoopReduction(RedPhi) &&
7153 RdxDesc.getRecurrenceKind()))
7154 continue;
7155
7156 // AnyOf reduction codegen may remove the select. To match the legacy cost
7157 // model, pre-compute the cost for AnyOf reductions here.
7159 RdxDesc.getRecurrenceKind())) {
7160 auto *Select = cast<SelectInst>(*find_if(
7161 RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7162 assert(!CostCtx.SkipCostComputation.contains(Select) &&
7163 "reduction op visited multiple times");
7164 CostCtx.SkipCostComputation.insert(Select);
7165 auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7166 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7167 << ":\n any-of reduction " << *Select << "\n");
7168 Cost += ReductionCost;
7169 continue;
7170 }
7171
7172 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7173 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7174 ChainOps.end());
7175 // Also include the operands of instructions in the chain, as the cost-model
7176 // may mark extends as free.
7177 for (auto *ChainOp : ChainOps) {
7178 for (Value *Op : ChainOp->operands()) {
7179 if (auto *I = dyn_cast<Instruction>(Op))
7180 ChainOpsAndOperands.insert(I);
7181 }
7182 }
7183
7184 // Pre-compute the cost for I, if it has a reduction pattern cost.
7185 for (Instruction *I : ChainOpsAndOperands) {
7186 auto ReductionCost = CM.getReductionPatternCost(
7187 I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7188 if (!ReductionCost)
7189 continue;
7190
7191 assert(!CostCtx.SkipCostComputation.contains(I) &&
7192 "reduction op visited multiple times");
7193 CostCtx.SkipCostComputation.insert(I);
7194 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7195 << ":\n in-loop reduction " << *I << "\n");
7196 Cost += *ReductionCost;
7197 }
7198 }
7199
7200 // Pre-compute the costs for branches except for the backedge, as the number
7201 // of replicate regions in a VPlan may not directly match the number of
7202 // branches, which would lead to different decisions.
7203 // TODO: Compute cost of branches for each replicate region in the VPlan,
7204 // which is more accurate than the legacy cost model.
7205 for (BasicBlock *BB : OrigLoop->blocks()) {
7206 if (BB == OrigLoop->getLoopLatch())
7207 continue;
7208 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7209 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7210 Cost += BranchCost;
7211 }
7212 // Now compute and add the VPlan-based cost.
7213 Cost += Plan.cost(VF, CostCtx);
7214 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7215 return Cost;
7216}
7217
7219 // If there is a single VPlan with a single VF, return it directly.
7220 VPlan &FirstPlan = *VPlans[0];
7221 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7222 return *FirstPlan.vectorFactors().begin();
7223
7225 assert(hasPlanWithVF(ScalarVF) &&
7226 "More than a single plan/VF w/o any plan having scalar VF");
7227
7228 // TODO: Compute scalar cost using VPlan-based cost model.
7229 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7230 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7231 VectorizationFactor BestFactor = ScalarFactor;
7232
7233 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7234 if (ForceVectorization) {
7235 // Ignore scalar width, because the user explicitly wants vectorization.
7236 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7237 // evaluation.
7238 BestFactor.Cost = InstructionCost::getMax();
7239 }
7240
7241 for (auto &P : VPlans) {
7242 for (ElementCount VF : P->vectorFactors()) {
7243 if (VF.isScalar())
7244 continue;
7245 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7246 LLVM_DEBUG(
7247 dbgs()
7248 << "LV: Not considering vector loop of width " << VF
7249 << " because it will not generate any vector instructions.\n");
7250 continue;
7251 }
7252
7253 InstructionCost Cost = cost(*P, VF);
7254 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7255 if (isMoreProfitable(CurrentFactor, BestFactor))
7256 BestFactor = CurrentFactor;
7257
7258 // If profitable add it to ProfitableVF list.
7259 if (isMoreProfitable(CurrentFactor, ScalarFactor))
7260 ProfitableVFs.push_back(CurrentFactor);
7261 }
7262 }
7263 return BestFactor.Width;
7264}
7265
7267 assert(count_if(VPlans,
7268 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7269 1 &&
7270 "Best VF has not a single VPlan.");
7271
7272 for (const VPlanPtr &Plan : VPlans) {
7273 if (Plan->hasVF(VF))
7274 return *Plan.get();
7275 }
7276 llvm_unreachable("No plan found!");
7277}
7278
7281 // Reserve first location for self reference to the LoopID metadata node.
7282 MDs.push_back(nullptr);
7283 bool IsUnrollMetadata = false;
7284 MDNode *LoopID = L->getLoopID();
7285 if (LoopID) {
7286 // First find existing loop unrolling disable metadata.
7287 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7288 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7289 if (MD) {
7290 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7291 IsUnrollMetadata =
7292 S && S->getString().starts_with("llvm.loop.unroll.disable");
7293 }
7294 MDs.push_back(LoopID->getOperand(i));
7295 }
7296 }
7297
7298 if (!IsUnrollMetadata) {
7299 // Add runtime unroll disable metadata.
7300 LLVMContext &Context = L->getHeader()->getContext();
7301 SmallVector<Metadata *, 1> DisableOperands;
7302 DisableOperands.push_back(
7303 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7304 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7305 MDs.push_back(DisableNode);
7306 MDNode *NewLoopID = MDNode::get(Context, MDs);
7307 // Set operand 0 to refer to the loop id itself.
7308 NewLoopID->replaceOperandWith(0, NewLoopID);
7309 L->setLoopID(NewLoopID);
7310 }
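  // Illustrative example (hypothetical, not from the surrounding code): the
  // resulting loop metadata has the shape
  //   !0 = distinct !{!0, ..., !1}
  //   !1 = !{!"llvm.loop.unroll.runtime.disable"}
  // where operand 0 of !0 refers to the node itself, as set up above.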
7311}
7312
7313// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7314// create a merge phi node for it and add it to \p ReductionResumeValues.
7315 static void createAndCollectMergePhiForReduction(
7316     VPInstruction *RedResult,
7317     DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7318     VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7319 bool VectorizingEpilogue) {
7320 if (!RedResult ||
7322 return;
7323
7324 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7325 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7326
7327 Value *FinalValue =
7328 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7329 auto *ResumePhi =
7330 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7331 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7332 RdxDesc.getRecurrenceKind())) {
7333 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7334 assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7335 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7336 ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7337 }
7338 assert((!VectorizingEpilogue || ResumePhi) &&
7339 "when vectorizing the epilogue loop, we need a resume phi from main "
7340 "vector loop");
7341
7342 // TODO: bc.merge.rdx should not be created here, instead it should be
7343 // modeled in VPlan.
7344 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7345 // Create a phi node that merges control-flow from the backedge-taken check
7346 // block and the middle block.
7347 auto *BCBlockPhi =
7348 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7349 LoopScalarPreHeader->getTerminator()->getIterator());
7350
7351 // If we are fixing reductions in the epilogue loop then we should already
7352 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7353 // we carry over the incoming values correctly.
7354 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7355 if (Incoming == LoopMiddleBlock)
7356 BCBlockPhi->addIncoming(FinalValue, Incoming);
7357 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7358 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7359 Incoming);
7360 else
7361 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7362 }
7363
7364 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7365 // TODO: This fixup should instead be modeled in VPlan.
7366 // Fix the scalar loop reduction variable with the incoming reduction sum
7367 // from the vector body and from the backedge value.
7368 int IncomingEdgeBlockIdx =
7369 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7370 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7371 // Pick the other block.
7372 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7373 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7374 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7375 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7376
7377 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7378}
7379
7380std::pair<DenseMap<const SCEV *, Value *>,
7381           DenseMap<const RecurrenceDescriptor *, Value *>>
7382 LoopVectorizationPlanner::executePlan(
7383     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7384 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7385 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7386 assert(BestVPlan.hasVF(BestVF) &&
7387 "Trying to execute plan with unsupported VF");
7388 assert(BestVPlan.hasUF(BestUF) &&
7389 "Trying to execute plan with unsupported UF");
7390 assert(
7391 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7392 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7393 (void)IsEpilogueVectorization;
7394
7395 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7396
7397 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7398 << ", UF=" << BestUF << '\n');
7399 BestVPlan.setName("Final VPlan");
7400 LLVM_DEBUG(BestVPlan.dump());
7401
7402 // Perform the actual loop transformation.
7403 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7404 OrigLoop->getHeader()->getContext());
7405
7406 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7407 // before making any changes to the CFG.
7408 if (!BestVPlan.getPreheader()->empty()) {
7409 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7411 BestVPlan.getPreheader()->execute(&State);
7412 }
7413 if (!ILV.getTripCount())
7414 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7415 else
7416 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7417 "count during epilogue vectorization");
7418
7419 // 1. Set up the skeleton for vectorization, including vector pre-header and
7420 // middle block. The vector loop is created during VPlan execution.
7421 Value *CanonicalIVStartValue;
7422 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7423 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7424 : State.ExpandedSCEVs);
7425#ifdef EXPENSIVE_CHECKS
7426 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7427#endif
7428
7429 // Only use noalias metadata when using memory checks guaranteeing no overlap
7430 // across all iterations.
7431 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7432 std::unique_ptr<LoopVersioning> LVer = nullptr;
7433 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7435
7436 // We currently don't use LoopVersioning for the actual loop cloning but we
7437 // still use it to add the noalias metadata.
7438 // TODO: Find a better way to re-use LoopVersioning functionality to add
7439 // metadata.
7440 LVer = std::make_unique<LoopVersioning>(
7441 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7442 PSE.getSE());
7443 State.LVer = &*LVer;
7445 }
7446
7448
7449 //===------------------------------------------------===//
7450 //
7451   // Notice: any optimization or new instruction that goes
7452 // into the code below should also be implemented in
7453 // the cost-model.
7454 //
7455 //===------------------------------------------------===//
7456
7457 // 2. Copy and widen instructions from the old loop into the new loop.
7458 BestVPlan.prepareToExecute(ILV.getTripCount(),
7459 ILV.getOrCreateVectorTripCount(nullptr),
7460 CanonicalIVStartValue, State);
7461
7462 BestVPlan.execute(&State);
7463
7464 // 2.5 Collect reduction resume values.
7466 auto *ExitVPBB =
7467 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7468 for (VPRecipeBase &R : *ExitVPBB) {
7469     createAndCollectMergePhiForReduction(
7470         dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7471 State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7472 }
7473
7474 // 2.6. Maintain Loop Hints
7475 // Keep all loop hints from the original loop on the vector loop (we'll
7476 // replace the vectorizer-specific hints below).
7477 MDNode *OrigLoopID = OrigLoop->getLoopID();
7478
7479 std::optional<MDNode *> VectorizedLoopID =
7482
7483 VPBasicBlock *HeaderVPBB =
7485 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7486 if (VectorizedLoopID)
7487 L->setLoopID(*VectorizedLoopID);
7488 else {
7489 // Keep all loop hints from the original loop on the vector loop (we'll
7490 // replace the vectorizer-specific hints below).
7491 if (MDNode *LID = OrigLoop->getLoopID())
7492 L->setLoopID(LID);
7493
7494 LoopVectorizeHints Hints(L, true, *ORE);
7495 Hints.setAlreadyVectorized();
7496 }
7498 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7499 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7501
7502 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7503 // predication, updating analyses.
7504 ILV.fixVectorizedLoop(State, BestVPlan);
7505
7507
7508 // 4. Adjust branch weight of the branch in the middle block.
7509 auto *MiddleTerm =
7510 cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7511 if (MiddleTerm->isConditional() &&
7512 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7513 // Assume that `Count % VectorTripCount` is equally distributed.
7514 unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7515 assert(TripCount > 0 && "trip count should not be zero");
7516 const uint32_t Weights[] = {1, TripCount - 1};
7517 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7518 }
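  // Illustrative example (hypothetical, not from the surrounding code): with
  // VF=4 and UF=2 the TripCount above is 8, so the middle-block branch gets
  // weights {1, 7}, reflecting the assumption that Count % VectorTripCount
  // is uniformly distributed over the eight possible remainders.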
7519
7520 return {State.ExpandedSCEVs, ReductionResumeValues};
7521}
7522
7523#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7525 for (const auto &Plan : VPlans)
7527 Plan->printDOT(O);
7528 else
7529 Plan->print(O);
7530}
7531#endif
7532
7533//===--------------------------------------------------------------------===//
7534// EpilogueVectorizerMainLoop
7535//===--------------------------------------------------------------------===//
7536
7537/// This function is partially responsible for generating the control flow
7538/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7539std::pair<BasicBlock *, Value *>
7541 const SCEV2ValueTy &ExpandedSCEVs) {
7543
7544 // Generate the code to check the minimum iteration count of the vector
7545 // epilogue (see below).
7549
7550 // Generate the code to check any assumptions that we've made for SCEV
7551 // expressions.
7553
7554 // Generate the code that checks at runtime if arrays overlap. We put the
7555 // checks into a separate block to make the more common case of few elements
7556 // faster.
7558
7559 // Generate the iteration count check for the main loop, *after* the check
7560 // for the epilogue loop, so that the path-length is shorter for the case
7561 // that goes directly through the vector epilogue. The longer-path length for
7562 // the main loop is compensated for, by the gain from vectorizing the larger
7563 // trip count. Note: the branch will get updated later on when we vectorize
7564 // the epilogue.
7567
7568 // Generate the induction variable.
7570
7571 // Skip induction resume value creation here because they will be created in
7572 // the second pass for the scalar loop. The induction resume values for the
7573 // inductions in the epilogue loop are created before executing the plan for
7574 // the epilogue loop.
7575
7576 return {LoopVectorPreHeader, nullptr};
7577}
7578
7580 LLVM_DEBUG({
7581 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7582 << "Main Loop VF:" << EPI.MainLoopVF
7583 << ", Main Loop UF:" << EPI.MainLoopUF
7584 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7585 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7586 });
7587}
7588
7591 dbgs() << "intermediate fn:\n"
7592 << *OrigLoop->getHeader()->getParent() << "\n";
7593 });
7594}
7595
7596BasicBlock *
7598 bool ForEpilogue) {
7599 assert(Bypass && "Expected valid bypass basic block.");
7600 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7601 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7602 Value *Count = getTripCount();
7603 // Reuse existing vector loop preheader for TC checks.
7604 // Note that new preheader block is generated for vector loop.
7605 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7606 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7607
7608 // Generate code to check if the loop's trip count is less than VF * UF of the
7609 // main vector loop.
7610 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7611 : VF.isVector())
7614
7615 Value *CheckMinIters = Builder.CreateICmp(
7616 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7617 "min.iters.check");
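  // Illustrative example (hypothetical, not from the surrounding code): for a
  // main loop with a fixed VF=4 and UF=2 this emits roughly
  //   %min.iters.check = icmp ult i64 %count, 8
  // (ule when a scalar epilogue is required), and the branch created below
  // takes the bypass edge when the trip count is too small for the vector
  // loop.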
7618
7619 if (!ForEpilogue)
7620 TCCheckBlock->setName("vector.main.loop.iter.check");
7621
7622 // Create new preheader for vector loop.
7623 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7624 DT, LI, nullptr, "vector.ph");
7625
7626 if (ForEpilogue) {
7627 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7628 DT->getNode(Bypass)->getIDom()) &&
7629 "TC check is expected to dominate Bypass");
7630
7631 // Update dominator for Bypass.
7632 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7633 LoopBypassBlocks.push_back(TCCheckBlock);
7634
7635 // Save the trip count so we don't have to regenerate it in the
7636 // vec.epilog.iter.check. This is safe to do because the trip count
7637 // generated here dominates the vector epilog iter check.
7638 EPI.TripCount = Count;
7639 }
7640
7641 BranchInst &BI =
7642 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7644 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7645 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7646
7647 return TCCheckBlock;
7648}
7649
7650//===--------------------------------------------------------------------===//
7651// EpilogueVectorizerEpilogueLoop
7652//===--------------------------------------------------------------------===//
7653
7654/// This function is partially responsible for generating the control flow
7655/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7656std::pair<BasicBlock *, Value *>
7658 const SCEV2ValueTy &ExpandedSCEVs) {
7659 createVectorLoopSkeleton("vec.epilog.");
7660
7661 // Now, compare the remaining count and if there aren't enough iterations to
7662 // execute the vectorized epilogue skip to the scalar part.
7663 LoopVectorPreHeader->setName("vec.epilog.ph");
7664 BasicBlock *VecEpilogueIterationCountCheck =
7666 nullptr, "vec.epilog.iter.check", true);
7668 VecEpilogueIterationCountCheck);
7669
7670 // Adjust the control flow taking the state info from the main loop
7671 // vectorization into account.
7673 "expected this to be saved from the previous pass.");
7675 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7676
7679
7681 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7682
7683 if (EPI.SCEVSafetyCheck)
7685 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7686 if (EPI.MemSafetyCheck)
7688 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7689
7691 VecEpilogueIterationCountCheck,
7692 VecEpilogueIterationCountCheck->getSinglePredecessor());
7693
7696 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7697 // If there is an epilogue which must run, there's no edge from the
7698 // middle block to exit blocks and thus no need to update the immediate
7699 // dominator of the exit blocks.
7702
7703 // Keep track of bypass blocks, as they feed start values to the induction and
7704 // reduction phis in the scalar loop preheader.
7705 if (EPI.SCEVSafetyCheck)
7707 if (EPI.MemSafetyCheck)
7710
7711 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7712 // reductions which merge control-flow from the latch block and the middle
7713 // block. Update the incoming values here and move the Phi into the preheader.
7714 SmallVector<PHINode *, 4> PhisInBlock;
7715 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7716 PhisInBlock.push_back(&Phi);
7717
7718 for (PHINode *Phi : PhisInBlock) {
7719 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7720 Phi->replaceIncomingBlockWith(
7721 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7722 VecEpilogueIterationCountCheck);
7723
7724 // If the phi doesn't have an incoming value from the
7725 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7726 // value and also those from other check blocks. This is needed for
7727 // reduction phis only.
7728 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7729 return EPI.EpilogueIterationCountCheck == IncB;
7730 }))
7731 continue;
7732 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7733 if (EPI.SCEVSafetyCheck)
7734 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7735 if (EPI.MemSafetyCheck)
7736 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7737 }
7738
7739 // Generate a resume induction for the vector epilogue and put it in the
7740 // vector epilogue preheader
7741 Type *IdxTy = Legal->getWidestInductionType();
7742 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7744 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7745 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7747
7748 // Generate induction resume values. These variables save the new starting
7749 // indexes for the scalar loop. They are used to test if there are any tail
7750 // iterations left once the vector loop has completed.
7751 // Note that when the vectorized epilogue is skipped due to iteration count
7752 // check, then the resume value for the induction variable comes from
7753 // the trip count of the main vector loop, hence passing the AdditionalBypass
7754 // argument.
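 // Illustrative example (values are assumptions, not from the source): with an
 // original trip count of 21, a main loop step of 16 and an epilogue step of 4,
 // the epilogue handles iterations 16..19 and the scalar loop resumes at 20;
 // with only 18 iterations the epilogue is skipped and the scalar loop resumes
 // at 16, the main loop's vector trip count.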
7755 createInductionResumeValues(ExpandedSCEVs,
7756 {VecEpilogueIterationCountCheck,
7757 EPI.VectorTripCount} /* AdditionalBypass */);
7758
7759 return {LoopVectorPreHeader, EPResumeVal};
7760}
7761
7762BasicBlock *
7764 BasicBlock *Bypass, BasicBlock *Insert) {
7765
7767 "Expected trip count to have been safed in the first pass.");
7768 assert(
7769 (!isa<Instruction>(EPI.TripCount) ||
7770 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7771 "saved trip count does not dominate insertion point.");
7772 Value *TC = EPI.TripCount;
7773 IRBuilder<> Builder(Insert->getTerminator());
7774 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7775
7776 // Generate code to check if the loop's trip count is less than VF * UF of the
7777 // vector epilogue loop.
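 // For example (illustrative values only): if the main loop ran with VF = 8,
 // UF = 2 and the epilogue uses VF = 4, UF = 1, then
 // %n.vec.remaining = %trip.count - %n.vec holds at most 15 iterations, and
 // the branch below skips the vectorized epilogue when too few iterations
 // (here, fewer than 4) remain.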
7778 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7781
7782 Value *CheckMinIters =
7783 Builder.CreateICmp(P, Count,
7786 "min.epilog.iters.check");
7787
7788 BranchInst &BI =
7789 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7791 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7792 unsigned EpilogueLoopStep =
7794 // We assume the remaining `Count` is equally distributed in
7795 // [0, MainLoopStep)
7796 // So the probability for `Count < EpilogueLoopStep` should be
7797 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
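 // Worked example (assumed values): MainLoopStep = 8 * 2 = 16 and
 // EpilogueLoopStep = 4 * 1 = 4 give EstimatedSkipCount = 4, i.e. weights
 // {4, 12}: the epilogue is expected to be skipped for roughly 4 of every 16
 // possible remainders.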
7798 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7799 const uint32_t Weights[] = {EstimatedSkipCount,
7800 MainLoopStep - EstimatedSkipCount};
7801 setBranchWeights(BI, Weights, /*IsExpected=*/false);
7802 }
7803 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7804 LoopBypassBlocks.push_back(Insert);
7805 return Insert;
7806}
7807
7809 LLVM_DEBUG({
7810 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7811 << "Epilogue Loop VF:" << EPI.EpilogueVF
7812 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7813 });
7814}
7815
7818 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7819 });
7820}
7821
7823 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7824 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7825 bool PredicateAtRangeStart = Predicate(Range.Start);
7826
7827 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7828 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7829 Range.End = TmpVF;
7830 break;
7831 }
7832
7833 return PredicateAtRangeStart;
7834}
7835
7836/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7837/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7838/// of VF's starting at a given VF and extending it as much as possible. Each
7839/// vectorization decision can potentially shorten this sub-range during
7840/// buildVPlan().
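/// For example (illustrative only): with MinVF = 4 and MaxVF = 16, the first
/// sub-range starts as [4, 32); if a vectorization decision changes from VF 16
/// onwards, the sub-range is clamped so one VPlan covers {4, 8} and a second
/// VPlan is then built for {16}.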
7842 ElementCount MaxVF) {
7843 auto MaxVFTimes2 = MaxVF * 2;
7844 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7845 VFRange SubRange = {VF, MaxVFTimes2};
7846 auto Plan = buildVPlan(SubRange);
7847 VPlanTransforms::optimize(*Plan, *PSE.getSE());
7848 VPlans.push_back(std::move(Plan));
7849 VF = SubRange.End;
7850 }
7851}
7852
7853iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7855 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7856 if (auto *I = dyn_cast<Instruction>(Op)) {
7857 if (auto *R = Ingredient2Recipe.lookup(I))
7858 return R->getVPSingleValue();
7859 }
7860 return Plan.getOrAddLiveIn(Op);
7861 };
7862 return map_range(Operands, Fn);
7863}
7864
7866 BasicBlock *Src = SI->getParent();
7867 assert(!OrigLoop->isLoopExiting(Src) &&
7868 all_of(successors(Src),
7869 [this](BasicBlock *Succ) {
7870 return OrigLoop->getHeader() != Succ;
7871 }) &&
7872 "unsupported switch either exiting loop or continuing to header");
7873 // Create masks where the terminator in Src is a switch. We create masks for
7874 // all edges at the same time. This is more efficient, as we can create and
7875 // collect compares for all cases once.
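 // Sketch of the result (an assumed example, not literal IR): for
 //   switch i32 %x [ 1 -> %bb1, 2 -> %bb1, 3 -> %bb2 ], default -> %bbD
 // the masks become bb1: (x==1)|(x==2), bb2: (x==3), and
 // bbD: !((x==1)|(x==2)|(x==3)), each logically ANDed with Src's block mask.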
7876 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition(), Plan);
7877 BasicBlock *DefaultDst = SI->getDefaultDest();
7879 for (auto &C : SI->cases()) {
7880 BasicBlock *Dst = C.getCaseSuccessor();
7881 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
7882 // Cases whose destination is the same as default are redundant and can be
7883 // ignored - they will get there anyhow.
7884 if (Dst == DefaultDst)
7885 continue;
7886 auto I = Dst2Compares.insert({Dst, {}});
7887 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue(), Plan);
7888 I.first->second.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
7889 }
7890
7891 // We need to handle 2 separate cases below for all entries in Dst2Compares,
7892 // which excludes destinations matching the default destination.
7893 VPValue *SrcMask = getBlockInMask(Src);
7894 VPValue *DefaultMask = nullptr;
7895 for (const auto &[Dst, Conds] : Dst2Compares) {
7896 // 1. Dst is not the default destination. Dst is reached if any of the cases
7897 // with destination == Dst are taken. Join the conditions for each case
7898 // whose destination == Dst using an OR.
7899 VPValue *Mask = Conds[0];
7900 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
7901 Mask = Builder.createOr(Mask, V);
7902 if (SrcMask)
7903 Mask = Builder.createLogicalAnd(SrcMask, Mask);
7904 EdgeMaskCache[{Src, Dst}] = Mask;
7905
7906 // 2. Create the mask for the default destination, which is reached if none
7907 // of the cases with destination != default destination are taken. Join the
7908 // conditions for each case where the destination is != Dst using an OR and
7909 // negate it.
7910 DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
7911 }
7912
7913 if (DefaultMask) {
7914 DefaultMask = Builder.createNot(DefaultMask);
7915 if (SrcMask)
7916 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
7917 }
7918 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
7919}
7920
7922 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7923
7924 // Look for cached value.
7925 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7926 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7927 if (ECEntryIt != EdgeMaskCache.end())
7928 return ECEntryIt->second;
7929
7930 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
7932 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
7933 return EdgeMaskCache[Edge];
7934 }
7935
7936 VPValue *SrcMask = getBlockInMask(Src);
7937
7938 // The terminator has to be a branch inst!
7939 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7940 assert(BI && "Unexpected terminator found");
7941 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7942 return EdgeMaskCache[Edge] = SrcMask;
7943
7944 // If source is an exiting block, we know the exit edge is dynamically dead
7945 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7946 // adding uses of an otherwise potentially dead instruction.
7947 if (OrigLoop->isLoopExiting(Src))
7948 return EdgeMaskCache[Edge] = SrcMask;
7949
7950 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7951 assert(EdgeMask && "No Edge Mask found for condition");
7952
7953 if (BI->getSuccessor(0) != Dst)
7954 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7955
7956 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7957 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
7958 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
7959 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7960 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
7961 }
7962
7963 return EdgeMaskCache[Edge] = EdgeMask;
7964}
7965
7967 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7968
7969 // Look for cached value.
7970 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7971 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
7972 assert(ECEntryIt != EdgeMaskCache.end() &&
7973 "looking up mask for edge which has not been created");
7974 return ECEntryIt->second;
7975}
7976
7978 BasicBlock *Header = OrigLoop->getHeader();
7979
7980 // When not folding the tail, use nullptr to model all-true mask.
7981 if (!CM.foldTailByMasking()) {
7982 BlockMaskCache[Header] = nullptr;
7983 return;
7984 }
7985
7986 // Introduce the early-exit compare IV <= BTC to form header block mask.
7987 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7988 // constructing the desired canonical IV in the header block as its first
7989 // non-phi instructions.
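 // For instance (an assumed case): with an i8 induction and exactly 256
 // iterations, TC truncates to 0 while BTC = 255 stays representable, so the
 // lane compare "widened IV <= BTC" remains correct where "IV < TC" would not.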
7990
7991 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7992 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
7993 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
7994 HeaderVPBB->insert(IV, NewInsertionPoint);
7995
7996 VPBuilder::InsertPointGuard Guard(Builder);
7997 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
7998 VPValue *BlockMask = nullptr;
8000 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8001 BlockMaskCache[Header] = BlockMask;
8002}
8003
8005 // Return the cached value.
8006 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8007 assert(BCEntryIt != BlockMaskCache.end() &&
8008 "Trying to access mask for block without one.");
8009 return BCEntryIt->second;
8010}
8011
8013 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8014 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8015 assert(OrigLoop->getHeader() != BB &&
8016 "Loop header must have cached block mask");
8017
8018 // All-one mask is modelled as no-mask following the convention for masked
8019 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8020 VPValue *BlockMask = nullptr;
8021 // This is the block mask. We OR all unique incoming edges.
8022 for (auto *Predecessor :
8024 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8025 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8026 BlockMaskCache[BB] = EdgeMask;
8027 return;
8028 }
8029
8030 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8031 BlockMask = EdgeMask;
8032 continue;
8033 }
8034
8035 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8036 }
8037
8038 BlockMaskCache[BB] = BlockMask;
8039}
8040
8042VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8043 VFRange &Range) {
8044 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8045 "Must be called with either a load or store");
8046
8047 auto willWiden = [&](ElementCount VF) -> bool {
8049 CM.getWideningDecision(I, VF);
8051 "CM decision should be taken at this point.");
8053 return true;
8054 if (CM.isScalarAfterVectorization(I, VF) ||
8055 CM.isProfitableToScalarize(I, VF))
8056 return false;
8058 };
8059
8061 return nullptr;
8062
8063 VPValue *Mask = nullptr;
8064 if (Legal->isMaskRequired(I))
8065 Mask = getBlockInMask(I->getParent());
8066
8067 // Determine if the pointer operand of the access is either consecutive or
8068 // reverse consecutive.
8070 CM.getWideningDecision(I, Range.Start);
8072 bool Consecutive =
8074
8075 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8076 if (Consecutive) {
8077 auto *GEP = dyn_cast<GetElementPtrInst>(
8078 Ptr->getUnderlyingValue()->stripPointerCasts());
8079 auto *VectorPtr = new VPVectorPointerRecipe(
8080 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8081 I->getDebugLoc());
8082 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8083 Ptr = VectorPtr;
8084 }
8085 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8086 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8087 I->getDebugLoc());
8088
8089 StoreInst *Store = cast<StoreInst>(I);
8090 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8091 Reverse, I->getDebugLoc());
8092}
8093
8094/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8095/// insert a recipe to expand the step for the induction recipe.
8098 VPValue *Start, const InductionDescriptor &IndDesc,
8099 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8100 assert(IndDesc.getStartValue() ==
8101 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8102 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8103 "step must be loop invariant");
8104
8105 VPValue *Step =
8107 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8108 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8109 }
8110 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8111 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8112}
8113
8114VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8116
8117 // Check if this is an integer or fp induction. If so, build the recipe that
8118 // produces its scalar and vector values.
8119 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8120 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8121 *PSE.getSE(), *OrigLoop);
8122
8123 // Check if this is pointer induction. If so, build the recipe for it.
8124 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8125 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8126 *PSE.getSE());
8128 Phi, Operands[0], Step, *II,
8130 [&](ElementCount VF) {
8131 return CM.isScalarAfterVectorization(Phi, VF);
8132 },
8133 Range));
8134 }
8135 return nullptr;
8136}
8137
8138VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8140 // Optimize the special case where the source is a constant integer
8141 // induction variable. Notice that we can only optimize the 'trunc' case
8142 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8143 // (c) other casts depend on pointer size.
8144
8145 // Determine whether \p K is a truncation based on an induction variable that
8146 // can be optimized.
8147 auto isOptimizableIVTruncate =
8148 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8149 return [=](ElementCount VF) -> bool {
8150 return CM.isOptimizableIVTruncate(K, VF);
8151 };
8152 };
8153
8155 isOptimizableIVTruncate(I), Range)) {
8156
8157 auto *Phi = cast<PHINode>(I->getOperand(0));
8159 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8160 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8161 *OrigLoop);
8162 }
8163 return nullptr;
8164}
8165
8166VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8168 unsigned NumIncoming = Phi->getNumIncomingValues();
8169
8170 // We know that all PHIs in non-header blocks are converted into selects, so
8171 // we don't have to worry about the insertion order and we can just use the
8172 // builder. At this point we generate the predication tree. There may be
8173 // duplications since this is a simple recursive scan, but future
8174 // optimizations will clean it up.
8175 // TODO: At the moment the first mask is always skipped, but it would be
8176 // better to skip the most expensive mask.
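 // Conceptually (a sketch, not the emitted recipe): a two-way phi
 //   %p = phi [ %a, %bb1 ], [ %b, %bb2 ]
 // becomes a blend of (%a, %b, mask-of-edge(%bb2)), which later lowers to a
 // select of %b when that edge mask is true and %a otherwise.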
8177 SmallVector<VPValue *, 2> OperandsWithMask;
8178
8179 for (unsigned In = 0; In < NumIncoming; In++) {
8180 OperandsWithMask.push_back(Operands[In]);
8181 VPValue *EdgeMask =
8182 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8183 if (!EdgeMask) {
8184 assert(In == 0 && "Both null and non-null edge masks found");
8186 "Distinct incoming values with one having a full mask");
8187 break;
8188 }
8189 if (In == 0)
8190 continue;
8191 OperandsWithMask.push_back(EdgeMask);
8192 }
8193 return new VPBlendRecipe(Phi, OperandsWithMask);
8194}
8195
8196VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8198 VFRange &Range) {
8200 [this, CI](ElementCount VF) {
8201 return CM.isScalarWithPredication(CI, VF);
8202 },
8203 Range);
8204
8205 if (IsPredicated)
8206 return nullptr;
8207
8209 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8210 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8211 ID == Intrinsic::pseudoprobe ||
8212 ID == Intrinsic::experimental_noalias_scope_decl))
8213 return nullptr;
8214
8215 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8216 Ops.push_back(Operands.back());
8217
8218 // Is it beneficial to perform an intrinsic call compared to a lib call?
8219 bool ShouldUseVectorIntrinsic =
8221 [&](ElementCount VF) -> bool {
8222 return CM.getCallWideningDecision(CI, VF).Kind ==
8224 },
8225 Range);
8226 if (ShouldUseVectorIntrinsic)
8227 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
8228 CI->getDebugLoc());
8229
8230 Function *Variant = nullptr;
8231 std::optional<unsigned> MaskPos;
8232 // Is it better to call a vectorized version of the function than to
8233 // scalarize the call?
8234 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8235 [&](ElementCount VF) -> bool {
8236 // The following case may be scalarized depending on the VF.
8237 // The flag shows whether we can use a usual Call for the vectorized
8238 // version of the instruction.
8239
8240 // If we've found a variant at a previous VF, then stop looking. A
8241 // vectorized variant of a function expects input in a certain shape
8242 // -- basically the number of input registers, the number of lanes
8243 // per register, and whether there's a mask required.
8244 // We store a pointer to the variant in the VPWidenCallRecipe, so
8245 // once we have an appropriate variant it's only valid for that VF.
8246 // This will force a different vplan to be generated for each VF that
8247 // finds a valid variant.
8248 if (Variant)
8249 return false;
8251 CM.getCallWideningDecision(CI, VF);
8253 Variant = Decision.Variant;
8254 MaskPos = Decision.MaskPos;
8255 return true;
8256 }
8257
8258 return false;
8259 },
8260 Range);
8261 if (ShouldUseVectorCall) {
8262 if (MaskPos.has_value()) {
8263 // We have 2 cases that would require a mask:
8264 // 1) The block needs to be predicated, either due to a conditional
8265 // in the scalar loop or use of an active lane mask with
8266 // tail-folding, and we use the appropriate mask for the block.
8267 // 2) No mask is required for the block, but the only available
8268 // vector variant at this VF requires a mask, so we synthesize an
8269 // all-true mask.
8270 VPValue *Mask = nullptr;
8271 if (Legal->isMaskRequired(CI))
8272 Mask = getBlockInMask(CI->getParent());
8273 else
8275 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8276
8277 Ops.insert(Ops.begin() + *MaskPos, Mask);
8278 }
8279
8280 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
8282 Variant);
8283 }
8284
8285 return nullptr;
8286}
8287
8288bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8289 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8290 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8291 // Instruction should be widened, unless it is scalar after vectorization,
8292 // scalarization is profitable or it is predicated.
8293 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8294 return CM.isScalarAfterVectorization(I, VF) ||
8295 CM.isProfitableToScalarize(I, VF) ||
8296 CM.isScalarWithPredication(I, VF);
8297 };
8299 Range);
8300}
8301
8302VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8304 VPBasicBlock *VPBB) {
8305 switch (I->getOpcode()) {
8306 default:
8307 return nullptr;
8308 case Instruction::SDiv:
8309 case Instruction::UDiv:
8310 case Instruction::SRem:
8311 case Instruction::URem: {
8312 // If not provably safe, use a select to form a safe divisor before widening the
8313 // div/rem operation itself. Otherwise fall through to general handling below.
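 // Rough shape of the guarded form (illustration only, assumed types):
 //   %safe.rhs = select <N x i1> %mask, <N x iK> %rhs, <N x iK> splat(1)
 //   %div      = udiv   <N x iK> %lhs, %safe.rhs
 // so masked-off lanes divide by 1 instead of a potentially faulting divisor.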
8314 if (CM.isPredicatedInst(I)) {
8316 VPValue *Mask = getBlockInMask(I->getParent());
8317 VPValue *One =
8318 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8319 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8320 Ops[1] = SafeRHS;
8321 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8322 }
8323 [[fallthrough]];
8324 }
8325 case Instruction::Add:
8326 case Instruction::And:
8327 case Instruction::AShr:
8328 case Instruction::FAdd:
8329 case Instruction::FCmp:
8330 case Instruction::FDiv:
8331 case Instruction::FMul:
8332 case Instruction::FNeg:
8333 case Instruction::FRem:
8334 case Instruction::FSub:
8335 case Instruction::ICmp:
8336 case Instruction::LShr:
8337 case Instruction::Mul:
8338 case Instruction::Or:
8339 case Instruction::Select:
8340 case Instruction::Shl:
8341 case Instruction::Sub:
8342 case Instruction::Xor:
8343 case Instruction::Freeze:
8344 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8345 };
8346}
8347
8349 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8350 for (VPHeaderPHIRecipe *R : PhisToFix) {
8351 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8352 VPRecipeBase *IncR =
8353 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8354 R->addOperand(IncR->getVPSingleValue());
8355 }
8356}
8357
8359 VFRange &Range) {
8361 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8362 Range);
8363
8364 bool IsPredicated = CM.isPredicatedInst(I);
8365
8366 // Even if the instruction is not marked as uniform, there are certain
8367 // intrinsic calls that can be effectively treated as such, so we check for
8368 // them here. Conservatively, we only do this for scalable vectors, since
8369 // for fixed-width VFs we can always fall back on full scalarization.
8370 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8371 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8372 case Intrinsic::assume:
8373 case Intrinsic::lifetime_start:
8374 case Intrinsic::lifetime_end:
8375 // For scalable vectors if one of the operands is variant then we still
8376 // want to mark as uniform, which will generate one instruction for just
8377 // the first lane of the vector. We can't scalarize the call in the same
8378 // way as for fixed-width vectors because we don't know how many lanes
8379 // there are.
8380 //
8381 // The reasons for doing it this way for scalable vectors are:
8382 // 1. For the assume intrinsic generating the instruction for the first
8383 // lane is still better than not generating any at all. For
8384 // example, the input may be a splat across all lanes.
8385 // 2. For the lifetime start/end intrinsics the pointer operand only
8386 // does anything useful when the input comes from a stack object,
8387 // which suggests it should always be uniform. For non-stack objects
8388 // the effect is to poison the object, which still allows us to
8389 // remove the call.
8390 IsUniform = true;
8391 break;
8392 default:
8393 break;
8394 }
8395 }
8396 VPValue *BlockInMask = nullptr;
8397 if (!IsPredicated) {
8398 // Finalize the recipe for Instr, first if it is not predicated.
8399 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8400 } else {
8401 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8402 // Instructions marked for predication are replicated and a mask operand is
8403 // added initially. Masked replicate recipes will later be placed under an
8404 // if-then construct to prevent side-effects. Generate recipes to compute
8405 // the block mask for this region.
8406 BlockInMask = getBlockInMask(I->getParent());
8407 }
8408
8409 // Note that there is some custom logic to mark some intrinsics as uniform
8410 // manually above for scalable vectors, which this assert needs to account for
8411 // as well.
8412 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8413 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8414 "Should not predicate a uniform recipe");
8415 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8416 IsUniform, BlockInMask);
8417 return Recipe;
8418}
8419
8423 VFRange &Range, VPBasicBlock *VPBB) {
8424 // First, check for specific widening recipes that deal with inductions, Phi
8425 // nodes, calls and memory operations.
8426 VPRecipeBase *Recipe;
8427 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8428 if (Phi->getParent() != OrigLoop->getHeader())
8429 return tryToBlend(Phi, Operands);
8430
8431 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8432 return Recipe;
8433
8434 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8435 assert((Legal->isReductionVariable(Phi) ||
8436 Legal->isFixedOrderRecurrence(Phi)) &&
8437 "can only widen reductions and fixed-order recurrences here");
8438 VPValue *StartV = Operands[0];
8439 if (Legal->isReductionVariable(Phi)) {
8440 const RecurrenceDescriptor &RdxDesc =
8441 Legal->getReductionVars().find(Phi)->second;
8442 assert(RdxDesc.getRecurrenceStartValue() ==
8443 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8444 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8445 CM.isInLoopReduction(Phi),
8446 CM.useOrderedReductions(RdxDesc));
8447 } else {
8448 // TODO: Currently fixed-order recurrences are modeled as chains of
8449 // first-order recurrences. If there are no users of the intermediate
8450 // recurrences in the chain, the fixed order recurrence should be modeled
8451 // directly, enabling more efficient codegen.
8452 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8453 }
8454
8455 PhisToFix.push_back(PhiRecipe);
8456 return PhiRecipe;
8457 }
8458
8459 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8460 cast<TruncInst>(Instr), Operands, Range)))
8461 return Recipe;
8462
8463 // All widen recipes below deal only with VF > 1.
8465 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8466 return nullptr;
8467
8468 if (auto *CI = dyn_cast<CallInst>(Instr))
8469 return tryToWidenCall(CI, Operands, Range);
8470
8471 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8472 return tryToWidenMemory(Instr, Operands, Range);
8473
8474 if (!shouldWiden(Instr, Range))
8475 return nullptr;
8476
8477 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8478 return new VPWidenGEPRecipe(GEP,
8479 make_range(Operands.begin(), Operands.end()));
8480
8481 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8482 return new VPWidenSelectRecipe(
8483 *SI, make_range(Operands.begin(), Operands.end()));
8484 }
8485
8486 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8487 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8488 *CI);
8489 }
8490
8491 return tryToWiden(Instr, Operands, VPBB);
8492}
8493
8494void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8495 ElementCount MaxVF) {
8496 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8497
8498 auto MaxVFTimes2 = MaxVF * 2;
8499 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8500 VFRange SubRange = {VF, MaxVFTimes2};
8501 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8502 // Now optimize the initial VPlan.
8503 if (!Plan->hasVF(ElementCount::getFixed(1)))
8505 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8506 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8507 // TODO: try to put it close to addActiveLaneMask().
8508 // Discard the plan if it is not EVL-compatible
8509 if (CM.foldTailWithEVL() &&
8511 break;
8512 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8513 VPlans.push_back(std::move(Plan));
8514 }
8515 VF = SubRange.End;
8516 }
8517}
8518
8519// Add the necessary canonical IV and branch recipes required to control the
8520// loop.
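// The generated control conceptually looks like (sketch, not literal output):
//   %index      = canonical-iv-phi [ 0, preheader ], [ %index.next, latch ]
//   %index.next = add %index, VF * UF
//   branch-on-count %index.next, %vector.trip.count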
8521static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8522 DebugLoc DL) {
8523 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8524 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8525
8526 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8527 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8528 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8529 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8530 Header->insert(CanonicalIVPHI, Header->begin());
8531
8532 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8533 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8534 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8535 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8536 "index.next");
8537 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8538
8539 // Add the BranchOnCount VPInstruction to the latch.
8541 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8542}
8543
8544// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8545// original exit block.
8547 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8549 auto MiddleVPBB =
8550 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
8551 // No edge from the middle block to the unique exit block has been inserted
8552 // and there is nothing to fix from vector loop; phis should have incoming
8553 // from scalar loop only.
8554 if (MiddleVPBB->getNumSuccessors() != 2)
8555 return;
8556
8557 // Introduce VPUsers modeling the exit values.
8558 BasicBlock *ExitBB =
8559 cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock();
8560 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8561 for (PHINode &ExitPhi : ExitBB->phis()) {
8562 Value *IncomingValue =
8563 ExitPhi.getIncomingValueForBlock(ExitingBB);
8564 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8565 // Exit values for inductions are computed and updated outside of VPlan and
8566 // independent of induction recipes.
8567 // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8568 // live-outs.
8569 if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8570 !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8571 isa<VPWidenPointerInductionRecipe>(V) ||
8572 (isa<Instruction>(IncomingValue) &&
8573 any_of(IncomingValue->users(), [&Inductions](User *U) {
8574 auto *P = dyn_cast<PHINode>(U);
8575 return P && Inductions.contains(P);
8576 })))
8577 continue;
8578 Plan.addLiveOut(&ExitPhi, V);
8579 }
8580}
8581
8582/// Feed a resume value for every FOR from the vector loop to the scalar loop,
8583/// if middle block branches to scalar preheader, by introducing ExtractFromEnd
8584/// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
8585/// latter and corresponds to the scalar header.
8587 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8588
8589 // Start by finding out if middle block branches to scalar preheader, which is
8590 // not a VPIRBasicBlock, unlike Exit block - the other possible successor of
8591 // middle block.
8592 // TODO: Should be replaced by
8593 // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8594 // scalar region is modeled as well.
8595 VPBasicBlock *ScalarPHVPBB = nullptr;
8596 auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
8597 for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
8598 if (isa<VPIRBasicBlock>(Succ))
8599 continue;
8600 assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
8601 ScalarPHVPBB = cast<VPBasicBlock>(Succ);
8602 }
8603 if (!ScalarPHVPBB)
8604 return;
8605
8606 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8607 VPBuilder MiddleBuilder(MiddleVPBB);
8608 // Reset insert point so new recipes are inserted before terminator and
8609 // condition, if there is either the former or both.
8610 if (auto *Terminator = MiddleVPBB->getTerminator()) {
8611 auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
8612 assert((!Condition || Condition->getParent() == MiddleVPBB) &&
8613 "Condition expected in MiddleVPBB");
8614 MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
8615 }
8616 VPValue *OneVPV = Plan.getOrAddLiveIn(
8617 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8618
8619 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8620 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8621 if (!FOR)
8622 continue;
8623
8624 // Extract the resume value and create a new VPLiveOut for it.
8625 auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
8626 {FOR->getBackedgeValue(), OneVPV},
8627 {}, "vector.recur.extract");
8628 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8629 VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
8630 "scalar.recur.init");
8631 Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
8632 }
8633}
8634
8636LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8637
8639
8640 // ---------------------------------------------------------------------------
8641 // Build initial VPlan: Scan the body of the loop in a topological order to
8642 // visit each basic block after having visited its predecessor basic blocks.
8643 // ---------------------------------------------------------------------------
8644
8645 // Create initial VPlan skeleton, having a basic block for the pre-header
8646 // which contains SCEV expansions that need to happen before the CFG is
8647 // modified; a basic block for the vector pre-header, followed by a region for
8648 // the vector loop, followed by the middle basic block. The skeleton vector
8649 // loop region contains a header and latch basic blocks.
8650
8651 bool RequiresScalarEpilogueCheck =
8653 [this](ElementCount VF) {
8654 return !CM.requiresScalarEpilogue(VF.isVector());
8655 },
8656 Range);
8658 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8659 *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8660 OrigLoop);
8661
8662 // Don't use getDecisionAndClampRange here, because we don't know the UF
8663 // so this function is better to be conservative, rather than to split
8664 // it up into different VPlans.
8665 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8666 bool IVUpdateMayOverflow = false;
8667 for (ElementCount VF : Range)
8668 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8669
8671 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8672 // When not folding the tail, we know that the induction increment will not
8673 // overflow.
8674 bool HasNUW = Style == TailFoldingStyle::None;
8675 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8676
8677 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8678
8679 // ---------------------------------------------------------------------------
8680 // Pre-construction: record ingredients whose recipes we'll need to further
8681 // process after constructing the initial VPlan.
8682 // ---------------------------------------------------------------------------
8683
8684 // For each interleave group which is relevant for this (possibly trimmed)
8685 // Range, add it to the set of groups to be later applied to the VPlan and add
8686 // placeholders for its members' Recipes which we'll be replacing with a
8687 // single VPInterleaveRecipe.
8689 auto applyIG = [IG, this](ElementCount VF) -> bool {
8690 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8691 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8693 // For scalable vectors, the only interleave factor currently supported
8694 // is 2 since we require the (de)interleave2 intrinsics instead of
8695 // shufflevectors.
8696 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8697 "Unsupported interleave factor for scalable vectors");
8698 return Result;
8699 };
8700 if (!getDecisionAndClampRange(applyIG, Range))
8701 continue;
8702 InterleaveGroups.insert(IG);
8703 };
8704
8705 // ---------------------------------------------------------------------------
8706 // Construct recipes for the instructions in the loop
8707 // ---------------------------------------------------------------------------
8708
8709 // Scan the body of the loop in a topological order to visit each basic block
8710 // after having visited its predecessor basic blocks.
8711 LoopBlocksDFS DFS(OrigLoop);
8712 DFS.perform(LI);
8713
8714 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8715 VPBasicBlock *VPBB = HeaderVPBB;
8716 BasicBlock *HeaderBB = OrigLoop->getHeader();
8717 bool NeedsMasks =
8718 CM.foldTailByMasking() ||
8719 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8720 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8721 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8722 });
8723 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8724 // Relevant instructions from basic block BB will be grouped into VPRecipe
8725 // ingredients and fill a new VPBasicBlock.
8726 if (VPBB != HeaderVPBB)
8727 VPBB->setName(BB->getName());
8728 Builder.setInsertPoint(VPBB);
8729
8730 if (VPBB == HeaderVPBB)
8731 RecipeBuilder.createHeaderMask();
8732 else if (NeedsMasks)
8733 RecipeBuilder.createBlockInMask(BB);
8734
8735 // Introduce each ingredient into VPlan.
8736 // TODO: Model and preserve debug intrinsics in VPlan.
8737 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8738 Instruction *Instr = &I;
8740 auto *Phi = dyn_cast<PHINode>(Instr);
8741 if (Phi && Phi->getParent() == HeaderBB) {
8742 Operands.push_back(Plan->getOrAddLiveIn(
8743 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8744 } else {
8745 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8746 Operands = {OpRange.begin(), OpRange.end()};
8747 }
8748
8749 // Invariant stores inside the loop will be deleted and a single store
8750 // with the final reduction value will be added to the exit block.
8751 StoreInst *SI;
8752 if ((SI = dyn_cast<StoreInst>(&I)) &&
8753 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8754 continue;
8755
8756 VPRecipeBase *Recipe =
8757 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8758 if (!Recipe)
8759 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8760
8761 RecipeBuilder.setRecipe(Instr, Recipe);
8762 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8763 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8764 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8765 // recipes and need to be moved to the phi section of HeaderVPBB:
8766 // * tail-folding (non-phi recipes computing the header mask are
8767 // introduced earlier than regular header phi recipes, and should appear
8768 // after them)
8769 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8770
8771 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8772 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8773 "unexpected recipe needs moving");
8774 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8775 } else
8776 VPBB->appendRecipe(Recipe);
8777 }
8778
8780 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8781 }
8782
8783 // After here, VPBB should not be used.
8784 VPBB = nullptr;
8785
8786 addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
8787 Legal->getInductionVars());
8788
8789 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8790 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8791 "entry block must be set to a VPRegionBlock having a non-empty entry "
8792 "VPBasicBlock");
8793 RecipeBuilder.fixHeaderPhis();
8794
8796
8797 // ---------------------------------------------------------------------------
8798 // Transform initial VPlan: Apply previously taken decisions, in order, to
8799 // bring the VPlan to its final state.
8800 // ---------------------------------------------------------------------------
8801
8802 // Adjust the recipes for any inloop reductions.
8803 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8804
8805 // Interleave memory: for each Interleave Group we marked earlier as relevant
8806 // for this VPlan, replace the Recipes widening its memory instructions with a
8807 // single VPInterleaveRecipe at its insertion point.
8808 for (const auto *IG : InterleaveGroups) {
8809 auto *Recipe =
8810 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8811 SmallVector<VPValue *, 4> StoredValues;
8812 for (unsigned i = 0; i < IG->getFactor(); ++i)
8813 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8814 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8815 StoredValues.push_back(StoreR->getStoredValue());
8816 }
8817
8818 bool NeedsMaskForGaps =
8819 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8820 assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
8821 "masked interleaved groups are not allowed.");
8822 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8823 Recipe->getMask(), NeedsMaskForGaps);
8824 VPIG->insertBefore(Recipe);
8825 unsigned J = 0;
8826 for (unsigned i = 0; i < IG->getFactor(); ++i)
8827 if (Instruction *Member = IG->getMember(i)) {
8828 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8829 if (!Member->getType()->isVoidTy()) {
8830 VPValue *OriginalV = MemberR->getVPSingleValue();
8831 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8832 J++;
8833 }
8834 MemberR->eraseFromParent();
8835 }
8836 }
8837
8838 for (ElementCount VF : Range)
8839 Plan->addVF(VF);
8840 Plan->setName("Initial VPlan");
8841
8842 // Replace VPValues for known constant strides guaranteed by predicate scalar
8843 // evolution.
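 // E.g. (an assumed scenario): if runtime SCEV checks versioned the loop on
 // %stride == 1, the live-in for %stride inside the plan is replaced by the
 // constant 1, and any sext/zext of %stride gets a matching widened constant.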
8844 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8845 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8846 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8847 // Only handle constant strides for now.
8848 if (!ScevStride)
8849 continue;
8850
8851 auto *CI = Plan->getOrAddLiveIn(
8852 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
8853 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
8854 StrideVPV->replaceAllUsesWith(CI);
8855
8856 // The versioned value may not be used in the loop directly but through a
8857 // sext/zext. Add new live-ins in those cases.
8858 for (Value *U : StrideV->users()) {
8859 if (!isa<SExtInst, ZExtInst>(U))
8860 continue;
8861 VPValue *StrideVPV = Plan->getLiveIn(U);
8862 if (!StrideVPV)
8863 continue;
8864 unsigned BW = U->getType()->getScalarSizeInBits();
8865 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
8866 : ScevStride->getAPInt().zext(BW);
8867 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
8868 StrideVPV->replaceAllUsesWith(CI);
8869 }
8870 }
8871
8873 return Legal->blockNeedsPredication(BB);
8874 });
8875
8876 // Sink users of fixed-order recurrence past the recipe defining the previous
8877 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8879 return nullptr;
8880
8881 if (useActiveLaneMask(Style)) {
8882 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8883 // TailFoldingStyle is visible there.
8884 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8885 bool WithoutRuntimeCheck =
8887 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8888 WithoutRuntimeCheck);
8889 }
8890 return Plan;
8891}
8892
8893VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8894 // Outer loop handling: They may require CFG and instruction level
8895 // transformations before even evaluating whether vectorization is profitable.
8896 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8897 // the vectorization pipeline.
8898 assert(!OrigLoop->isInnermost());
8899 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8900
8901 // Create new empty VPlan
8902 auto Plan = VPlan::createInitialVPlan(
8903 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8904 *PSE.getSE(), true, false, OrigLoop);
8905
8906 // Build hierarchical CFG
8907 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8908 HCFGBuilder.buildHierarchicalCFG();
8909
8910 for (ElementCount VF : Range)
8911 Plan->addVF(VF);
8912
8914 Plan,
8915 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8916 *PSE.getSE(), *TLI);
8917
8918 // Remove the existing terminator of the exiting block of the top-most region.
8919 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8920 auto *Term =
8921 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8922 Term->eraseFromParent();
8923
8924 // Tail folding is not supported for outer loops, so the induction increment
8925 // is guaranteed to not wrap.
8926 bool HasNUW = true;
8927 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8928 DebugLoc());
8929 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8930 return Plan;
8931}
8932
8933// Adjust the recipes for reductions. For in-loop reductions the chain of
8934// instructions leading from the loop exit instr to the phi need to be converted
8935// to reductions, with one operand being vector and the other being the scalar
8936// reduction chain. For other reductions, a select is introduced between the phi
8937// and live-out recipes when folding the tail.
8938//
8939// A ComputeReductionResult recipe is added to the middle block, also for
8940// in-loop reductions which compute their result in-loop, because generating
8941// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8942//
8943// Adjust AnyOf reductions; replace the reduction phi for the selected value
8944// with a boolean reduction phi node to check if the condition is true in any
8945// iteration. The final value is selected by the final ComputeReductionResult.
8946void LoopVectorizationPlanner::adjustRecipesForReductions(
8947 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8948 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8949 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8950 // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8951 // sunk outside of the loop keep the same order as they had in the
8952 // original loop.
8953 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8954 for (VPRecipeBase &R : Header->phis()) {
8955 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8956 ReductionPHIList.emplace_back(ReductionPhi);
8957 }
8958 bool HasIntermediateStore = false;
8959 stable_sort(ReductionPHIList,
8960 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8961 const VPReductionPHIRecipe *R2) {
8962 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8963 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8964 HasIntermediateStore |= IS1 || IS2;
8965
8966 // If neither of the recipes has an intermediate store, keep the
8967 // order the same.
8968 if (!IS1 && !IS2)
8969 return false;
8970
8971 // If only one of the recipes has an intermediate store, then
8972 // move it towards the beginning of the list.
8973 if (IS1 && !IS2)
8974 return true;
8975
8976 if (!IS1 && IS2)
8977 return false;
8978
8979 // If both recipes have an intermediate store, then the recipe
8980 // with the later store should be processed earlier. So it
8981 // should go to the beginning of the list.
8982 return DT->dominates(IS2, IS1);
8983 });
8984
8985 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8986 for (VPRecipeBase *R : ReductionPHIList)
8987 R->moveBefore(*Header, Header->getFirstNonPhi());
8988
8989 for (VPRecipeBase &R : Header->phis()) {
8990 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8991 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8992 continue;
8993
8994 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8995 RecurKind Kind = RdxDesc.getRecurrenceKind();
8997 "AnyOf reductions are not allowed for in-loop reductions");
8998
8999 // Collect the chain of "link" recipes for the reduction starting at PhiR.
9001 Worklist.insert(PhiR);
9002 for (unsigned I = 0; I != Worklist.size(); ++I) {
9003 VPSingleDefRecipe *Cur = Worklist[I];
9004 for (VPUser *U : Cur->users()) {
9005 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
9006 if (!UserRecipe) {
9007 assert(isa<VPLiveOut>(U) &&
9008 "U must either be a VPSingleDef or VPLiveOut");
9009 continue;
9010 }
9011 Worklist.insert(UserRecipe);
9012 }
9013 }
9014
9015 // Visit operation "Links" along the reduction chain top-down starting from
9016 // the phi until LoopExitValue. We keep track of the previous item
9017 // (PreviousLink) to tell which of the two operands of a Link will remain
9018 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9019 // the select instructions. Blend recipes of in-loop reduction phis will
9020 // get folded to their non-phi operand, as the reduction recipe handles the
9021 // condition directly.
9022 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9023 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9024 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9025
9026 // Index of the first operand which holds a non-mask vector operand.
9027 unsigned IndexOfFirstOperand;
9028 // Recognize a call to the llvm.fmuladd intrinsic.
9029 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9030 VPValue *VecOp;
9031 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9032 if (IsFMulAdd) {
9033 assert(
9035 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9036 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9037 isa<VPWidenCallRecipe>(CurrentLink)) &&
9038 CurrentLink->getOperand(2) == PreviousLink &&
9039 "expected a call where the previous link is the added operand");
9040
9041 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9042 // need to create an fmul recipe (multiplying the first two operands of
9043 // the fmuladd together) to use as the vector operand for the fadd
9044 // reduction.
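 // For instance (sketch): %r = call float @llvm.fmuladd.f32(%a, %b, %acc) is
 // split so that %mul = fmul float %a, %b feeds the fadd reduction together
 // with the chain value %acc.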
9045 VPInstruction *FMulRecipe = new VPInstruction(
9046 Instruction::FMul,
9047 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9048 CurrentLinkI->getFastMathFlags());
9049 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9050 VecOp = FMulRecipe;
9051 } else {
9052 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9053 if (PhiR->isInLoop() && Blend) {
9054 assert(Blend->getNumIncomingValues() == 2 &&
9055 "Blend must have 2 incoming values");
9056 if (Blend->getIncomingValue(0) == PhiR)
9057 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9058 else {
9059 assert(Blend->getIncomingValue(1) == PhiR &&
9060 "PhiR must be an operand of the blend");
9061 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9062 }
9063 continue;
9064 }
9065
9067 if (isa<VPWidenRecipe>(CurrentLink)) {
9068 assert(isa<CmpInst>(CurrentLinkI) &&
9069 "need to have the compare of the select");
9070 continue;
9071 }
9072 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9073 "must be a select recipe");
9074 IndexOfFirstOperand = 1;
9075 } else {
9076 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9077 "Expected to replace a VPWidenSC");
9078 IndexOfFirstOperand = 0;
9079 }
9080 // Note that for non-commutable operands (cmp-selects), the semantics of
9081 // the cmp-select are captured in the recurrence kind.
9082 unsigned VecOpId =
9083 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9084 ? IndexOfFirstOperand + 1
9085 : IndexOfFirstOperand;
9086 VecOp = CurrentLink->getOperand(VecOpId);
9087 assert(VecOp != PreviousLink &&
9088 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9089 (VecOpId - IndexOfFirstOperand)) ==
9090 PreviousLink &&
9091 "PreviousLink must be the operand other than VecOp");
9092 }
9093
9094 BasicBlock *BB = CurrentLinkI->getParent();
9095 VPValue *CondOp = nullptr;
9097 CondOp = RecipeBuilder.getBlockInMask(BB);
9098
9099 VPReductionRecipe *RedRecipe =
9100 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
9101 CondOp, CM.useOrderedReductions(RdxDesc));
9102 // Append the recipe to the end of the VPBasicBlock because we need to
9103 // ensure that it comes after all of its inputs, including CondOp.
9104 // Note that this transformation may leave over dead recipes (including
9105 // CurrentLink), which will be cleaned by a later VPlan transform.
9106 LinkVPBB->appendRecipe(RedRecipe);
9107 CurrentLink->replaceAllUsesWith(RedRecipe);
9108 PreviousLink = RedRecipe;
9109 }
9110 }
9111 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9112 Builder.setInsertPoint(&*LatchVPBB->begin());
9113 VPBasicBlock *MiddleVPBB =
9114 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
9115 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9116 for (VPRecipeBase &R :
9117 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9118 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9119 if (!PhiR)
9120 continue;
9121
9122 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9123 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9124 // with a boolean reduction phi node to check if the condition is true in
9125 // any iteration. The final value is selected by the final
9126 // ComputeReductionResult.
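 // Scalar shape of such a reduction (an assumed example):
 //   r = cond ? 42 : r
 // is rewritten so the vector loop tracks any = any | cond, and the final
 // ComputeReductionResult picks 42 or the start value based on "any".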
9128 RdxDesc.getRecurrenceKind())) {
9129 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9130 return isa<VPWidenSelectRecipe>(U) ||
9131 (isa<VPReplicateRecipe>(U) &&
9132 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9133 Instruction::Select);
9134 }));
9135 VPValue *Cmp = Select->getOperand(0);
9136 // If the compare is checking the reduction PHI node, adjust it to check
9137 // the start value.
9138 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9139 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9140 if (CmpR->getOperand(I) == PhiR)
9141 CmpR->setOperand(I, PhiR->getStartValue());
9142 }
9143 VPBuilder::InsertPointGuard Guard(Builder);
9144 Builder.setInsertPoint(Select);
9145
9146 // If the true value of the select is the reduction phi, the new value is
9147 // selected if the negated condition is true in any iteration.
9148 if (Select->getOperand(1) == PhiR)
9149 Cmp = Builder.createNot(Cmp);
9150 VPValue *Or = Builder.createOr(PhiR, Cmp);
9151 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9152
9153 // Convert the reduction phi to operate on bools.
9154 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9155 OrigLoop->getHeader()->getContext())));
9156 }
9157
9158 // If tail is folded by masking, introduce selects between the phi
9159 // and the live-out instruction of each reduction, at the beginning of the
9160 // dedicated latch block.
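// Shape of the emitted select, with illustrative operand names:
//   %rdx.select = select <header mask>, %rdx.next, %rdx.phi
// so lanes that are masked off in the final, partially active iteration keep
// the value carried by the reduction phi rather than a freshly computed one.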
9161 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9162 auto *NewExitingVPV = PhiR->getBackedgeValue();
9163 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9164 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9165 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9166 "reduction recipe must be defined before latch");
9167 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9168 std::optional<FastMathFlags> FMFs =
9169 PhiTy->isFloatingPointTy()
9170 ? std::make_optional(RdxDesc.getFastMathFlags())
9171 : std::nullopt;
9172 NewExitingVPV =
9173 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9174 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9175 return isa<VPInstruction>(&U) &&
9176 cast<VPInstruction>(&U)->getOpcode() ==
9178 });
9181 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9183 PhiR->setOperand(1, NewExitingVPV);
9184 }
9185
9186 // If the vector reduction can be performed in a smaller type, we truncate
9187 // then extend the loop exit value to enable InstCombine to evaluate the
9188 // entire expression in the smaller type.
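// For example (illustrative types), an add reduction over i8 data that was
// promoted to an i32 phi gets its exiting value narrowed and re-widened:
//   %trunc = trunc <VF x i32> %rdx.next to <VF x i8>
//   %ext   = zext  <VF x i8>  %trunc    to <VF x i32>   ; sext if signed
// which lets InstCombine shrink the rest of the reduction chain to i8.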
9189 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9190 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9192 RdxDesc.getRecurrenceKind())) {
9193 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9194 Type *RdxTy = RdxDesc.getRecurrenceType();
9195 auto *Trunc =
9196 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9197 auto *Extnd =
9198 RdxDesc.isSigned()
9199 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9200 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9201
9202 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9203 Extnd->insertAfter(Trunc);
9204 if (PhiR->getOperand(1) == NewExitingVPV)
9205 PhiR->setOperand(1, Extnd->getVPSingleValue());
9206 NewExitingVPV = Extnd;
9207 }
9208
9209 // We want code in the middle block to appear to execute on the location of
9210 // the scalar loop's latch terminator because: (a) it is all compiler
9211 // generated, (b) these instructions are always executed after evaluating
9212 // the latch conditional branch, and (c) other passes may add new
9213 // predecessors which terminate on this line. This is the easiest way to
9214 // ensure we don't accidentally cause an extra step back into the loop while
9215 // debugging.
9216 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9217
9218 // TODO: At the moment ComputeReductionResult also drives creation of the
9219 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9220 // even for in-loop reductions, until the reduction resume value handling is
9221 // also modeled in VPlan.
9222 auto *FinalReductionResult = new VPInstruction(
9223 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9224 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9225 OrigExitingVPV->replaceUsesWithIf(
9226 FinalReductionResult,
9227 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9228 }
9229
9231}
9232
9234 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9235
9236 // Fast-math-flags propagate from the original induction instruction.
9238 if (FPBinOp)
9239 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9240
9241 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9242 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9243 Value *DerivedIV = emitTransformedIndex(
9244 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9245 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9246 DerivedIV->setName("offset.idx");
9247 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9248
9249 State.set(this, DerivedIV, VPIteration(0, 0));
9250}
9251
9254 if (State.Instance) { // Generate a single instance.
9255 assert((State.VF.isScalar() || !isUniform()) &&
9256 "uniform recipe shouldn't be predicated");
9257 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9258 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9259 // Insert scalar instance packing it into a vector.
9260 if (State.VF.isVector() && shouldPack()) {
9261 // If we're constructing lane 0, initialize to start from poison.
9262 if (State.Instance->Lane.isFirstLane()) {
9263 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9265 VectorType::get(UI->getType(), State.VF));
9266 State.set(this, Poison, State.Instance->Part);
9267 }
9268 State.packScalarIntoVectorValue(this, *State.Instance);
9269 }
9270 return;
9271 }
9272
9273 if (IsUniform) {
9274 // If the recipe is uniform across all parts (instead of just per VF), only
9275 // generate a single instance.
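// E.g. (illustrative) a load whose address is computed entirely outside the
// vector region: one scalar load is emitted for part 0 and its result is
// reused for every other unrolled part.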
9276 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9277 all_of(operands(), [](VPValue *Op) {
9278 return Op->isDefinedOutsideVectorRegions();
9279 })) {
9280 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9281 if (user_begin() != user_end()) {
9282 for (unsigned Part = 1; Part < State.UF; ++Part)
9283 State.set(this, State.get(this, VPIteration(0, 0)),
9284 VPIteration(Part, 0));
9285 }
9286 return;
9287 }
9288
9289 // Uniform within VL means we need to generate lane 0 only for each
9290 // unrolled copy.
9291 for (unsigned Part = 0; Part < State.UF; ++Part)
9292 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9293 return;
9294 }
9295
9296 // A store of a loop varying value to a uniform address only needs the last
9297 // copy of the store.
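// E.g. (illustrative) a loop doing "*p = a[i];" with a loop-invariant p:
// every earlier store is overwritten, so only the store from the last lane of
// the last unrolled part needs to be emitted.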
9298 if (isa<StoreInst>(UI) &&
9300 auto Lane = VPLane::getLastLaneForVF(State.VF);
9301 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9302 State);
9303 return;
9304 }
9305
9306 // Generate scalar instances for all VF lanes of all UF parts.
9307 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9308 const unsigned EndLane = State.VF.getKnownMinValue();
9309 for (unsigned Part = 0; Part < State.UF; ++Part)
9310 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9311 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9312}
9313
9314/// Use all-true mask for reverse rather than actual mask, as it avoids a
9315/// dependence w/o affecting the result.
9317 Value *EVL, const Twine &Name) {
9318 VectorType *ValTy = cast<VectorType>(Operand->getType());
9319 Value *AllTrueMask =
9320 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
9321 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9322 {Operand, AllTrueMask, EVL}, nullptr, Name);
9323}
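// Roughly, for a <vscale x 4 x i32> operand the helper above produces
// (names illustrative):
//   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
//              <vscale x 4 x i32> %operand, <vscale x 4 x i1> splat (i1 true),
//              i32 %evl)
// i.e. the mask is all-true and only %evl limits the reversed region.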
9324
9326 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9327 "explicit vector length.");
9328 auto *LI = cast<LoadInst>(&Ingredient);
9329
9330 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9331 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9332 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9333 bool CreateGather = !isConsecutive();
9334
9335 auto &Builder = State.Builder;
9337 CallInst *NewLI;
9338 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9339 Value *Addr = State.get(getAddr(), 0, !CreateGather);
9340 Value *Mask = nullptr;
9341 if (VPValue *VPMask = getMask()) {
9342 Mask = State.get(VPMask, 0);
9343 if (isReverse())
9344 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9345 } else {
9346 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9347 }
9348
9349 if (CreateGather) {
9350 NewLI =
9351 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9352 nullptr, "wide.masked.gather");
9353 } else {
9354 VectorBuilder VBuilder(Builder);
9355 VBuilder.setEVL(EVL).setMask(Mask);
9356 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9357 Instruction::Load, DataTy, Addr, "vp.op.load"));
9358 }
9359 NewLI->addParamAttr(
9360 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9361 State.addMetadata(NewLI, LI);
9362 Instruction *Res = NewLI;
9363 if (isReverse())
9364 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
9365 State.set(this, Res, 0);
9366}
9367
9369 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9370 "explicit vector length.");
9371 auto *SI = cast<StoreInst>(&Ingredient);
9372
9373 VPValue *StoredValue = getStoredValue();
9374 bool CreateScatter = !isConsecutive();
9375 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9376
9377 auto &Builder = State.Builder;
9379
9380 CallInst *NewSI = nullptr;
9381 Value *StoredVal = State.get(StoredValue, 0);
9382 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9383 if (isReverse())
9384 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
9385 Value *Mask = nullptr;
9386 if (VPValue *VPMask = getMask()) {
9387 Mask = State.get(VPMask, 0);
9388 if (isReverse())
9389 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9390 } else {
9391 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9392 }
9393 Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9394 if (CreateScatter) {
9395 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9396 Intrinsic::vp_scatter,
9397 {StoredVal, Addr, Mask, EVL});
9398 } else {
9399 VectorBuilder VBuilder(Builder);
9400 VBuilder.setEVL(EVL).setMask(Mask);
9401 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9402 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9403 {StoredVal, Addr}));
9404 }
9405 NewSI->addParamAttr(
9406 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9407 State.addMetadata(NewSI, SI);
9408}
9409
9410// Determine how to lower the scalar epilogue, which depends on 1) optimising
9411// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9412// predication, and 4) a TTI hook that analyses whether the loop is suitable
9413// for predication.
9418 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9419 // don't look at hints or options, and don't request a scalar epilogue.
9420 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9421 // LoopAccessInfo (due to code dependency and not being able to reliably get
9422 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9423 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9424 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9425 // back to the old way and vectorize with versioning when forced. See D81345.)
9426 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9430
9431 // 2) If set, obey the directives
9432 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9440 };
9441 }
9442
9443 // 3) If set, obey the hints
9444 switch (Hints.getPredicate()) {
9449 };
9450
9451 // 4) if the TTI hook indicates this is profitable, request predication.
9452 TailFoldingInfo TFI(TLI, &LVL, IAI);
9455
9457}
9458
9459// Process the loop in the VPlan-native vectorization path. This path builds
9460// VPlan upfront in the vectorization pipeline, which allows applying
9461// VPlan-to-VPlan transformations from the very beginning without modifying the
9462// input LLVM IR.
9469 LoopVectorizationRequirements &Requirements) {
9470
9471 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9472 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9473 return false;
9474 }
9475 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9476 Function *F = L->getHeader()->getParent();
9477 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9478
9480 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9481
9482 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9483 &Hints, IAI);
9484 // Use the planner for outer loop vectorization.
9485 // TODO: CM is not used at this point inside the planner. Turn CM into an
9486 // optional argument if we don't need it in the future.
9487 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9488 ORE);
9489
9490 // Get user vectorization factor.
9491 ElementCount UserVF = Hints.getWidth();
9492
9494
9495 // Plan how to best vectorize, return the best VF and its cost.
9496 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9497
9498 // If we are stress testing VPlan builds, do not attempt to generate vector
9499 // code. Masked vector code generation support will follow soon.
9500 // Also, do not attempt to vectorize if no vector code will be produced.
9502 return false;
9503
9504 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9505
9506 {
9507 bool AddBranchWeights =
9508 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9509 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9510 F->getDataLayout(), AddBranchWeights);
9511 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9512 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9513 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9514 << L->getHeader()->getParent()->getName() << "\"\n");
9515 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9516 }
9517
9518 reportVectorization(ORE, L, VF, 1);
9519
9520 // Mark the loop as already vectorized to avoid vectorizing again.
9521 Hints.setAlreadyVectorized();
9522 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9523 return true;
9524}
9525
9526// Emit a remark if there are stores to floats that required a floating point
9527// extension. If the vectorized loop was generated with floating point, there
9528// will be a performance penalty from the conversion overhead and the change in
9529// the vector width.
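// A typical pattern that triggers the remark (purely illustrative IR):
//   %w = fpext float %x to double
//   %m = fmul double %w, %scale
//   %t = fptrunc double %m to float
//   store float %t, ptr %p
// The walk below starts at the float store and follows operands upwards until
// it reaches the fpext.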
9532 for (BasicBlock *BB : L->getBlocks()) {
9533 for (Instruction &Inst : *BB) {
9534 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9535 if (S->getValueOperand()->getType()->isFloatTy())
9536 Worklist.push_back(S);
9537 }
9538 }
9539 }
9540
9541 // Traverse the floating point stores upwards, searching for floating point
9542 // conversions.
9545 while (!Worklist.empty()) {
9546 auto *I = Worklist.pop_back_val();
9547 if (!L->contains(I))
9548 continue;
9549 if (!Visited.insert(I).second)
9550 continue;
9551
9552 // Emit a remark if the floating point store required a floating
9553 // point conversion.
9554 // TODO: More work could be done to identify the root cause such as a
9555 // constant or a function return type and point the user to it.
9556 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9557 ORE->emit([&]() {
9558 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9559 I->getDebugLoc(), L->getHeader())
9560 << "floating point conversion changes vector width. "
9561 << "Mixed floating point precision requires an up/down "
9562 << "cast that will negatively impact performance.";
9563 });
9564
9565 for (Use &Op : I->operands())
9566 if (auto *OpI = dyn_cast<Instruction>(Op))
9567 Worklist.push_back(OpI);
9568 }
9569}
9570
9571static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9573 std::optional<unsigned> VScale, Loop *L,
9574 ScalarEvolution &SE,
9576 InstructionCost CheckCost = Checks.getCost();
9577 if (!CheckCost.isValid())
9578 return false;
9579
9580 // When interleaving only, the scalar and vector costs will be equal, which in
9581 // turn would lead to a divide by 0. Fall back to a hard threshold.
9582 if (VF.Width.isScalar()) {
9583 if (CheckCost > VectorizeMemoryCheckThreshold) {
9584 LLVM_DEBUG(
9585 dbgs()
9586 << "LV: Interleaving only is not profitable due to runtime checks\n");
9587 return false;
9588 }
9589 return true;
9590 }
9591
9592 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9593 uint64_t ScalarC = *VF.ScalarCost.getValue();
9594 if (ScalarC == 0)
9595 return true;
9596
9597 // First, compute the minimum iteration count required so that the vector
9598 // loop outperforms the scalar loop.
9599 // The total cost of the scalar loop is
9600 // ScalarC * TC
9601 // where
9602 // * TC is the actual trip count of the loop.
9603 // * ScalarC is the cost of a single scalar iteration.
9604 //
9605 // The total cost of the vector loop is
9606 // RtC + VecC * (TC / VF) + EpiC
9607 // where
9608 // * RtC is the cost of the generated runtime checks
9609 // * VecC is the cost of a single vector iteration.
9610 // * TC is the actual trip count of the loop
9611 // * VF is the vectorization factor
9612 // * EpiCost is the cost of the generated epilogue, including the cost
9613 // of the remaining scalar operations.
9614 //
9615 // Vectorization is profitable once the total vector cost is less than the
9616 // total scalar cost:
9617 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9618 //
9619 // Now we can compute the minimum required trip count TC as
9620 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9621 //
9622 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9623 // the computations below use integer arithmetic and the result
9624 // is rounded up, hence we get an upper estimate of the TC.
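// As a worked example with purely hypothetical costs, take ScalarC = 4,
// VecC = 10, RtC = 30 and a fixed VF = 4:
//   VF * (RtC + EpiC) / (ScalarC * VF - VecC) = 4 * 30 / (16 - 10) = 20
// which matches MinTC1 = divideCeil(30 * 4, 6) = 20 computed below.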
9625 unsigned IntVF = VF.Width.getKnownMinValue();
9626 if (VF.Width.isScalable()) {
9627 unsigned AssumedMinimumVscale = 1;
9628 if (VScale)
9629 AssumedMinimumVscale = *VScale;
9630 IntVF *= AssumedMinimumVscale;
9631 }
9632 uint64_t RtC = *CheckCost.getValue();
9633 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9634 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9635
9636 // Second, compute a minimum iteration count so that the cost of the
9637 // runtime checks is only a fraction of the total scalar loop cost. This
9638 // adds a loop-dependent bound on the overhead incurred if the runtime
9639 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9640 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9641 // cost, compute
9642 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9643 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
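// Continuing the hypothetical numbers from above (RtC = 30, ScalarC = 4, the
// fraction X fixed at 10): MinTC2 = divideCeil(30 * 10, 4) = 75, which
// dominates the 20 from the first bound; with a scalar epilogue allowed and
// VF = 4 it is rounded up below to the next multiple of VF, 76.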
9644
9645 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9646 // epilogue is allowed, choose the next closest multiple of VF. This should
9647 // partly compensate for ignoring the epilogue cost.
9648 uint64_t MinTC = std::max(MinTC1, MinTC2);
9649 if (SEL == CM_ScalarEpilogueAllowed)
9650 MinTC = alignTo(MinTC, IntVF);
9652
9653 LLVM_DEBUG(
9654 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9655 << VF.MinProfitableTripCount << "\n");
9656
9657 // Skip vectorization if the expected trip count is less than the minimum
9658 // required trip count.
9659 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9662 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9663 "trip count < minimum profitable VF ("
9664 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9665 << ")\n");
9666
9667 return false;
9668 }
9669 }
9670 return true;
9671}
9672
9674 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9676 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9678
9680 assert((EnableVPlanNativePath || L->isInnermost()) &&
9681 "VPlan-native path is not enabled. Only process inner loops.");
9682
9683 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9684 << L->getHeader()->getParent()->getName() << "' from "
9685 << L->getLocStr() << "\n");
9686
9687 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9688
9689 LLVM_DEBUG(
9690 dbgs() << "LV: Loop hints:"
9691 << " force="
9693 ? "disabled"
9695 ? "enabled"
9696 : "?"))
9697 << " width=" << Hints.getWidth()
9698 << " interleave=" << Hints.getInterleave() << "\n");
9699
9700 // Function containing loop
9701 Function *F = L->getHeader()->getParent();
9702
9703 // Looking at the diagnostic output is the only way to determine if a loop
9704 // was vectorized (other than looking at the IR or machine code), so it
9705 // is important to generate an optimization remark for each loop. Most of
9706 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9707 // generated as OptimizationRemark and OptimizationRemarkMissed are
9708 // less verbose, reporting vectorized loops and unvectorized loops that may
9709 // benefit from vectorization, respectively.
9710
9711 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9712 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9713 return false;
9714 }
9715
9716 PredicatedScalarEvolution PSE(*SE, *L);
9717
9718 // Check if it is legal to vectorize the loop.
9719 LoopVectorizationRequirements Requirements;
9720 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9721 &Requirements, &Hints, DB, AC, BFI, PSI);
9723 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9724 Hints.emitRemarkWithHints();
9725 return false;
9726 }
9727
9728 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9729 // here. They may require CFG and instruction level transformations before
9730 // even evaluating whether vectorization is profitable. Since we cannot modify
9731 // the incoming IR, we need to build VPlan upfront in the vectorization
9732 // pipeline.
9733 if (!L->isInnermost())
9734 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9735 ORE, BFI, PSI, Hints, Requirements);
9736
9737 assert(L->isInnermost() && "Inner loop expected.");
9738
9739 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9740 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9741
9742 // If an override option has been passed in for interleaved accesses, use it.
9743 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9744 UseInterleaved = EnableInterleavedMemAccesses;
9745
9746 // Analyze interleaved memory accesses.
9747 if (UseInterleaved)
9749
9750 // Check the function attributes and profiles to find out if this function
9751 // should be optimized for size.
9753 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9754
9755 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9756 // count by optimizing for size, to minimize overheads.
9757 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9758 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9759 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9760 << "This loop is worth vectorizing only if no scalar "
9761 << "iteration overheads are incurred.");
9763 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9764 else {
9765 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9766 LLVM_DEBUG(dbgs() << "\n");
9767 // Predicated tail-folded loops are efficient even when the loop
9768 // iteration count is low. However, setting the epilogue policy to
9769 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9770 // with runtime checks. It's more effective to let
9771 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9772 // for the loop.
9775 } else {
9776 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9777 "small to consider vectorizing.\n");
9779 "The trip count is below the minial threshold value.",
9780 "loop trip count is too low, avoiding vectorization",
9781 "LowTripCount", ORE, L);
9782 Hints.emitRemarkWithHints();
9783 return false;
9784 }
9785 }
9786 }
9787
9788 // Check the function attributes to see if implicit floats or vectors are
9789 // allowed.
9790 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9792 "Can't vectorize when the NoImplicitFloat attribute is used",
9793 "loop not vectorized due to NoImplicitFloat attribute",
9794 "NoImplicitFloat", ORE, L);
9795 Hints.emitRemarkWithHints();
9796 return false;
9797 }
9798
9799 // Check if the target supports potentially unsafe FP vectorization.
9800 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9801 // for the target we're vectorizing for, to make sure none of the
9802 // additional fp-math flags can help.
9803 if (Hints.isPotentiallyUnsafe() &&
9806 "Potentially unsafe FP op prevents vectorization",
9807 "loop not vectorized due to unsafe FP support.",
9808 "UnsafeFP", ORE, L);
9809 Hints.emitRemarkWithHints();
9810 return false;
9811 }
9812
9813 bool AllowOrderedReductions;
9814 // If the flag is set, use that instead and override the TTI behaviour.
9815 if (ForceOrderedReductions.getNumOccurrences() > 0)
9816 AllowOrderedReductions = ForceOrderedReductions;
9817 else
9818 AllowOrderedReductions = TTI->enableOrderedReductions();
9819 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9820 ORE->emit([&]() {
9821 auto *ExactFPMathInst = Requirements.getExactFPInst();
9822 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9823 ExactFPMathInst->getDebugLoc(),
9824 ExactFPMathInst->getParent())
9825 << "loop not vectorized: cannot prove it is safe to reorder "
9826 "floating-point operations";
9827 });
9828 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9829 "reorder floating-point operations\n");
9830 Hints.emitRemarkWithHints();
9831 return false;
9832 }
9833
9834 // Use the cost model.
9835 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9836 F, &Hints, IAI);
9837 // Use the planner for vectorization.
9838 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9839 ORE);
9840
9841 // Get user vectorization factor and interleave count.
9842 ElementCount UserVF = Hints.getWidth();
9843 unsigned UserIC = Hints.getInterleave();
9844
9845 // Plan how to best vectorize, return the best VF and its cost.
9846 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9847
9850
9852 unsigned IC = 1;
9853
9854 bool AddBranchWeights =
9855 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9856 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9857 F->getDataLayout(), AddBranchWeights);
9858 if (MaybeVF) {
9859 VF = *MaybeVF;
9860 // Select the interleave count.
9861 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9862
9863 unsigned SelectedIC = std::max(IC, UserIC);
9864 // Optimistically generate runtime checks if they are needed. Drop them if
9865 // they turn out to not be profitable.
9866 if (VF.Width.isVector() || SelectedIC > 1)
9867 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9868
9869 // Check if it is profitable to vectorize with runtime checks.
9870 bool ForceVectorization =
9872 if (!ForceVectorization &&
9874 *PSE.getSE(), SEL)) {
9875 ORE->emit([&]() {
9877 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9878 L->getHeader())
9879 << "loop not vectorized: cannot prove it is safe to reorder "
9880 "memory operations";
9881 });
9882 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9883 Hints.emitRemarkWithHints();
9884 return false;
9885 }
9886 }
9887
9888 // Identify the diagnostic messages that should be produced.
9889 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9890 bool VectorizeLoop = true, InterleaveLoop = true;
9891 if (VF.Width.isScalar()) {
9892 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9893 VecDiagMsg = std::make_pair(
9894 "VectorizationNotBeneficial",
9895 "the cost-model indicates that vectorization is not beneficial");
9896 VectorizeLoop = false;
9897 }
9898
9899 if (!MaybeVF && UserIC > 1) {
9900 // Tell the user interleaving was avoided up-front, despite being explicitly
9901 // requested.
9902 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9903 "interleaving should be avoided up front\n");
9904 IntDiagMsg = std::make_pair(
9905 "InterleavingAvoided",
9906 "Ignoring UserIC, because interleaving was avoided up front");
9907 InterleaveLoop = false;
9908 } else if (IC == 1 && UserIC <= 1) {
9909 // Tell the user interleaving is not beneficial.
9910 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9911 IntDiagMsg = std::make_pair(
9912 "InterleavingNotBeneficial",
9913 "the cost-model indicates that interleaving is not beneficial");
9914 InterleaveLoop = false;
9915 if (UserIC == 1) {
9916 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9917 IntDiagMsg.second +=
9918 " and is explicitly disabled or interleave count is set to 1";
9919 }
9920 } else if (IC > 1 && UserIC == 1) {
9921 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9922 LLVM_DEBUG(
9923 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9924 IntDiagMsg = std::make_pair(
9925 "InterleavingBeneficialButDisabled",
9926 "the cost-model indicates that interleaving is beneficial "
9927 "but is explicitly disabled or interleave count is set to 1");
9928 InterleaveLoop = false;
9929 }
9930
9931 // Override IC if user provided an interleave count.
9932 IC = UserIC > 0 ? UserIC : IC;
9933
9934 // Emit diagnostic messages, if any.
9935 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9936 if (!VectorizeLoop && !InterleaveLoop) {
9937 // Do not vectorize or interleave the loop.
9938 ORE->emit([&]() {
9939 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9940 L->getStartLoc(), L->getHeader())
9941 << VecDiagMsg.second;
9942 });
9943 ORE->emit([&]() {
9944 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9945 L->getStartLoc(), L->getHeader())
9946 << IntDiagMsg.second;
9947 });
9948 return false;
9949 } else if (!VectorizeLoop && InterleaveLoop) {
9950 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9951 ORE->emit([&]() {
9952 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9953 L->getStartLoc(), L->getHeader())
9954 << VecDiagMsg.second;
9955 });
9956 } else if (VectorizeLoop && !InterleaveLoop) {
9957 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9958 << ") in " << L->getLocStr() << '\n');
9959 ORE->emit([&]() {
9960 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9961 L->getStartLoc(), L->getHeader())
9962 << IntDiagMsg.second;
9963 });
9964 } else if (VectorizeLoop && InterleaveLoop) {
9965 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9966 << ") in " << L->getLocStr() << '\n');
9967 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9968 }
9969
9970 bool DisableRuntimeUnroll = false;
9971 MDNode *OrigLoopID = L->getLoopID();
9972 {
9973 using namespace ore;
9974 if (!VectorizeLoop) {
9975 assert(IC > 1 && "interleave count should not be 1 or 0");
9976 // If we decided that it is not legal to vectorize the loop, then
9977 // interleave it.
9978 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9979 &CM, BFI, PSI, Checks);
9980
9981 ElementCount BestVF = LVP.getBestVF();
9982 assert(BestVF.isScalar() &&
9983 "VPlan cost model and legacy cost model disagreed");
9984 VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
9985 LVP.executePlan(BestVF, IC, BestPlan, Unroller, DT, false);
9986
9987 ORE->emit([&]() {
9988 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9989 L->getHeader())
9990 << "interleaved loop (interleaved count: "
9991 << NV("InterleaveCount", IC) << ")";
9992 });
9993 } else {
9994 // If we decided that it is *legal* to vectorize the loop, then do it.
9995
9996 ElementCount BestVF = LVP.getBestVF();
9997 LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n");
9998 assert(VF.Width == BestVF &&
9999 "VPlan cost model and legacy cost model disagreed");
10000 VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
10001 // Consider vectorizing the epilogue too if it's profitable.
10002 VectorizationFactor EpilogueVF =
10003 LVP.selectEpilogueVectorizationFactor(BestVF, IC);
10004 if (EpilogueVF.Width.isVector()) {
10005
10006 // The first pass vectorizes the main loop and creates a scalar epilogue
10007 // to be vectorized by executing the plan (potentially with a different
10008 // factor) again shortly afterwards.
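// Schematically (block names illustrative, except vec.epilog.vector.body,
// which is set a few lines below):
//   main vector loop (VF = BestVF) -> epilogue vector loop
//     ("vec.epilog.vector.body", VF = EpilogueVF) -> scalar remainder loop
// with resume values threaded through so each stage continues where the
// previous one stopped.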
10009 EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
10010 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10011 EPI, &LVL, &CM, BFI, PSI, Checks);
10012
10013 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10014 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10015 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10016 ++LoopsVectorized;
10017
10018 // Second pass vectorizes the epilogue and adjusts the control flow
10019 // edges from the first pass.
10020 EPI.MainLoopVF = EPI.EpilogueVF;
10021 EPI.MainLoopUF = EPI.EpilogueUF;
10022 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10023 ORE, EPI, &LVL, &CM, BFI, PSI,
10024 Checks);
10025
10026 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10027 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10028 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10029 Header->setName("vec.epilog.vector.body");
10030
10031 // Re-use the trip count and steps expanded for the main loop, as
10032 // skeleton creation needs them as values that dominate both the scalar
10033 // and vector epilogue loops.
10034 // TODO: This is a workaround needed for epilogue vectorization and it
10035 // should be removed once induction resume value creation is done
10036 // directly in VPlan.
10037 EpilogILV.setTripCount(MainILV.getTripCount());
10038 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10039 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10040 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10041 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10042 ExpandR->replaceAllUsesWith(ExpandedVal);
10043 if (BestEpiPlan.getTripCount() == ExpandR)
10044 BestEpiPlan.resetTripCount(ExpandedVal);
10045 ExpandR->eraseFromParent();
10046 }
10047
10048 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10049 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10050 // before vectorizing the epilogue loop.
10051 for (VPRecipeBase &R : Header->phis()) {
10052 if (isa<VPCanonicalIVPHIRecipe>(&R))
10053 continue;
10054
10055 Value *ResumeV = nullptr;
10056 // TODO: Move setting of resume values to prepareToExecute.
10057 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10058 const RecurrenceDescriptor &RdxDesc =
10059 ReductionPhi->getRecurrenceDescriptor();
10060 RecurKind RK = RdxDesc.getRecurrenceKind();
10061 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10063 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10064 // start value; compare the final value from the main vector loop
10065 // to the start value.
10066 IRBuilder<> Builder(
10067 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10068 ResumeV = Builder.CreateICmpNE(ResumeV,
10069 RdxDesc.getRecurrenceStartValue());
10070 }
10071 } else {
10072 // Create induction resume values for both widened pointer and
10073 // integer/fp inductions and update the start value of the induction
10074 // recipes to use the resume value.
10075 PHINode *IndPhi = nullptr;
10076 const InductionDescriptor *ID;
10077 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10078 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10079 ID = &Ind->getInductionDescriptor();
10080 } else {
10081 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10082 IndPhi = WidenInd->getPHINode();
10083 ID = &WidenInd->getInductionDescriptor();
10084 }
10085
10086 ResumeV = MainILV.createInductionResumeValue(
10087 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10089 }
10090 assert(ResumeV && "Must have a resume value");
10091 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10092 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10093 }
10094
10095 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10096 "DT not preserved correctly");
10097 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10098 DT, true, &ExpandedSCEVs);
10099 ++LoopsEpilogueVectorized;
10100
10101 if (!MainILV.areSafetyChecksAdded())
10102 DisableRuntimeUnroll = true;
10103 } else {
10104 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, BestVF,
10105 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10106 PSI, Checks);
10107 LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
10108 ++LoopsVectorized;
10109
10110 // Add metadata to disable runtime unrolling of the scalar loop when there
10111 // are no runtime checks about strides and memory. A scalar loop that is
10112 // rarely used is not worth unrolling.
10113 if (!LB.areSafetyChecksAdded())
10114 DisableRuntimeUnroll = true;
10115 }
10116 // Report the vectorization decision.
10117 reportVectorization(ORE, L, VF, IC);
10118 }
10119
10122 }
10123
10124 std::optional<MDNode *> RemainderLoopID =
10127 if (RemainderLoopID) {
10128 L->setLoopID(*RemainderLoopID);
10129 } else {
10130 if (DisableRuntimeUnroll)
10132
10133 // Mark the loop as already vectorized to avoid vectorizing again.
10134 Hints.setAlreadyVectorized();
10135 }
10136
10137 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10138 return true;
10139}
10140
10142
10143 // Don't attempt if
10144 // 1. the target claims to have no vector registers, and
10145 // 2. interleaving won't help ILP.
10146 //
10147 // The second condition is necessary because, even if the target has no
10148 // vector registers, loop vectorization may still enable scalar
10149 // interleaving.
10152 return LoopVectorizeResult(false, false);
10153
10154 bool Changed = false, CFGChanged = false;
10155
10156 // The vectorizer requires loops to be in simplified form.
10157 // Since simplification may add new inner loops, it has to run before the
10158 // legality and profitability checks. This means running the loop vectorizer
10159 // will simplify all loops, regardless of whether anything ends up being
10160 // vectorized.
10161 for (const auto &L : *LI)
10162 Changed |= CFGChanged |=
10163 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10164
10165 // Build up a worklist of inner-loops to vectorize. This is necessary as
10166 // the act of vectorizing or partially unrolling a loop creates new loops
10167 // and can invalidate iterators across the loops.
10168 SmallVector<Loop *, 8> Worklist;
10169
10170 for (Loop *L : *LI)
10171 collectSupportedLoops(*L, LI, ORE, Worklist);
10172
10173 LoopsAnalyzed += Worklist.size();
10174
10175 // Now walk the identified inner loops.
10176 while (!Worklist.empty()) {
10177 Loop *L = Worklist.pop_back_val();
10178
10179 // For the inner loops we actually process, form LCSSA to simplify the
10180 // transform.
10181 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10182
10183 Changed |= CFGChanged |= processLoop(L);
10184
10185 if (Changed) {
10186 LAIs->clear();
10187
10188#ifndef NDEBUG
10189 if (VerifySCEV)
10190 SE->verify();
10191#endif
10192 }
10193 }
10194
10195 // Process each loop nest in the function.
10196 return LoopVectorizeResult(Changed, CFGChanged);
10197}
10198
10201 LI = &AM.getResult<LoopAnalysis>(F);
10202 // There are no loops in the function. Return before computing other
10203 // expensive analyses.
10204 if (LI->empty())
10205 return PreservedAnalyses::all();
10214
10215 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10216 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10217 BFI = nullptr;
10218 if (PSI && PSI->hasProfileSummary())
10220 LoopVectorizeResult Result = runImpl(F);
10221 if (!Result.MadeAnyChange)
10222 return PreservedAnalyses::all();
10224
10225 if (isAssignmentTrackingEnabled(*F.getParent())) {
10226 for (auto &BB : F)
10228 }
10229
10230 PA.preserve<LoopAnalysis>();
10234
10235 if (Result.MadeCFGChange) {
10236 // Making CFG changes likely means a loop got vectorized. Indicate that
10237 // extra simplification passes should be run.
10238 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10239 // be run if runtime checks have been added.
10242 } else {
10244 }
10245 return PA;
10246}
10247
10249 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10250 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10251 OS, MapClassName2PassName);
10252
10253 OS << '<';
10254 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10255 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10256 OS << '>';
10257}
@ Poison
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan)
Feed a resume value for every FOR from the vector loop to the scalar loop, if middle block branches t...
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector< PHINode *, InductionDescriptor > &Inductions)
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
This file contains the declarations for profiling metadata utility functions.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
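A short, self-contained sketch of the two APInt accessors listed above; the values are chosen only for illustration.
#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintExample() {
  APInt AllOnes = APInt::getAllOnes(8);    // 8-bit value with all bits set (0xFF)
  int64_t Signed = AllOnes.getSExtValue(); // sign-extends to -1
  (void)Signed;
}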
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:460
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:233
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:517
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:374
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
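A minimal sketch of the BasicBlock accessors listed above; countPhis is a hypothetical helper used only for illustration.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

unsigned countPhis(BasicBlock *BB) {
  unsigned NumPhis = 0;
  for (PHINode &Phi : BB->phis()) { // iterate only the leading phi nodes
    (void)Phi;
    ++NumPhis;
  }
  if (const Instruction *Term = BB->getTerminator())
    (void)Term;                     // null only if the block is malformed
  return NumPhis;
}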
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ FIRST_ICMP_PREDICATE
Definition: InstrTypes.h:788
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:146
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
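A minimal sketch of the DenseMap operations listed above; the map contents are arbitrary.
#include "llvm/ADT/DenseMap.h"
using namespace llvm;

void denseMapExample() {
  DenseMap<int, int> Widths;
  Widths.insert({1, 32});          // returns std::pair<iterator, bool>
  bool Known = Widths.contains(1); // true
  int Missing = Widths.lookup(2);  // missing keys yield a default value (0)
  (void)Known;
  (void)Missing;
}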
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
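A minimal sketch of keeping a DominatorTree consistent after introducing a new block, using only the calls listed above; updateDomTreeFor is a hypothetical helper.
#include "llvm/IR/Dominators.h"
using namespace llvm;

void updateDomTreeFor(DominatorTree &DT, BasicBlock *NewBB, BasicBlock *DomBB) {
  DT.addNewBlock(NewBB, DomBB);  // register NewBB with immediate dominator DomBB
  bool Consistent = DT.verify(); // full (expensive) consistency check
  (void)Consistent;
}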
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:317
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
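ElementCount encodes both fixed and scalable vectorization factors. A minimal sketch of the factory functions and predicates listed above:
#include "llvm/Support/TypeSize.h"
using namespace llvm;

void elementCountExample() {
  ElementCount Fixed4 = ElementCount::getFixed(4);   // e.g. <4 x i32>
  ElementCount Scal4 = ElementCount::getScalable(4); // e.g. <vscale x 4 x i32>
  bool IsVec = Fixed4.isVector();                    // true: more than one element
  bool IsOne = ElementCount::getFixed(1).isScalar(); // true: exactly one element
  (void)Scal4;
  (void)IsVec;
  (void)IsOne;
}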
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:705
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:214
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:384
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:769
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:702
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:743
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:922
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1193
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2261
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2371
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1421
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2686
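A minimal sketch that strings a few of the IRBuilder calls listed above together; the function and value names are hypothetical.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

void emitCompareSelect(BasicBlock *BB, Value *X, Value *Y) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append to the end of BB
  Value *Sum = Builder.CreateAdd(X, Y, "sum");
  Value *IsEq = Builder.CreateICmpEQ(Sum, Y, "is.eq");
  Builder.CreateSelect(IsEq, Builder.getTrue(), Builder.getFalse(), "flag");
}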
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitability analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:97
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
Definition: Instruction.h:276
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:470
uint32_t getFactor() const
Definition: VectorUtils.h:486
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:540
InstTy * getInsertPos() const
Definition: VectorUtils.h:556
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:612
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:657
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:668
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:649
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:632
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:662
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
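A minimal sketch of walking the interleave groups collected by InterleavedAccessInfo, using only the accessors listed above:
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

void visitInterleaveGroups(InterleavedAccessInfo &IAI) {
  for (InterleaveGroup<Instruction> *Group : IAI.getInterleaveGroups()) {
    for (uint32_t Idx = 0; Idx < Group->getFactor(); ++Idx)
      if (Instruction *Member = Group->getMember(Idx))
        (void)Member;                         // gaps in the group return nullptr
    Instruction *InsertPos = Group->getInsertPos();
    (void)InsertPos;
  }
}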
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:174
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1254
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
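A minimal sketch of visiting a loop's blocks in reverse post-order with LoopBlocksRPO, the pattern the entries above support; walkLoopInRPO is a hypothetical helper.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
using namespace llvm;

void walkLoopInRPO(Loop *L, LoopInfo *LI) {
  LoopBlocksRPO RPOT(L);
  RPOT.perform(LI);           // run and cache the DFS once
  for (BasicBlock *BB : RPOT) // the loop header comes first
    (void)BB;
}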
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for two cases: whether the IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns true if the given address is invariant and is used to store a recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
ElementCount getBestVF()
Return the most profitable vectorization factor.
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
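getDecisionAndClampRange is the planner's basic tool for making one decision per VF range: it evaluates the predicate at the start of the range and clamps the range so the same answer holds for every VF left in it. A hedged, in-file sketch of that usage, assuming the VFRange and cost model types declared in this file; the helper name is hypothetical.
// Returns the decision taken for the first VF in Range and shrinks Range so
// every remaining VF shares that decision.
static bool willScalarizeWithPredication(Instruction *I,
                                         LoopVectorizationCostModel &CM,
                                         VFRange &Range) {
  return LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
      Range);
}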
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints that enable vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1542
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
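For context, AddRuntimeUnrollDisableMetaData (listed earlier) appends a one-operand metadata node to the loop ID. A minimal sketch of building such a node with the MDString/MDNode factories above:
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

MDNode *makeDisableRuntimeUnrollMD(LLVMContext &Ctx) {
  // Produces !{!"llvm.loop.unroll.runtime.disable"}.
  MDString *Name = MDString::get(Ctx, "llvm.loop.unroll.runtime.disable");
  return MDNode::get(Ctx, {Name});
}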
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR uni...
Definition: PassManager.h:688
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
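Resume values for the scalar epilogue are ordinary phis with one incoming value per bypass edge. A minimal sketch in the spirit of createInductionResumeValue; all names below are hypothetical.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

PHINode *makeResumePhi(Type *Ty, BasicBlock *ScalarPreHeader,
                       BasicBlock *MiddleBlock, BasicBlock *BypassBlock,
                       Value *VectorEnd, Value *StartValue) {
  PHINode *Resume = PHINode::Create(Ty, /*NumReservedValues=*/2,
                                    "bc.resume.val", ScalarPreHeader->begin());
  Resume->addIncoming(VectorEnd, MiddleBlock);  // reached after the vector loop
  Resume->addIncoming(StartValue, BypassBlock); // reached when it was bypassed
  return Resume;
}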
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
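A minimal sketch of the ScalarEvolution queries listed above; the helper name is hypothetical.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

void scevQueries(ScalarEvolution &SE, Loop *L, Value *Ptr) {
  const SCEV *PtrSCEV = SE.getSCEV(Ptr);
  bool Invariant = SE.isLoopInvariant(PtrSCEV, L); // unchanging across L?
  unsigned TC = SE.getSmallConstantTripCount(L);   // 0 if unknown or not constant
  (void)Invariant;
  (void)TC;
}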
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
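SetVector combines set semantics with deterministic insertion order, which is why it is the worklist type used throughout. A minimal, self-contained sketch:
#include "llvm/ADT/SetVector.h"
using namespace llvm;

void worklistExample() {
  SetVector<int> Worklist;
  Worklist.insert(3); // true: newly inserted
  Worklist.insert(3); // false: duplicate, order unchanged
  Worklist.insert(7);
  while (!Worklist.empty()) {
    int Item = Worklist.pop_back_val(); // 7, then 3
    (void)Item;
  }
}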
size_type size() const
Definition: SmallPtrSet.h:96
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:347
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:385
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:436
iterator end() const
Definition: SmallPtrSet.h:461
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
iterator begin() const
Definition: SmallPtrSet.h:456
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:503
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:951
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:697
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
An instruction for storing to memory.
Definition: Instructions.h:290
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
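
The TargetTransformInfo queries above are what the cost model consults when pricing memory operations. As a minimal, hedged sketch (not the pass's actual logic), the helper below chooses between a masked and a plain vector-load cost; the function name and parameters are illustrative only.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Sketch only: compare masked vs. plain vector-load cost via TTI.
static InstructionCost pickLoadCost(const TargetTransformInfo &TTI,
                                    VectorType *VecTy, Align Alignment,
                                    unsigned AddrSpace) {
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  if (TTI.isLegalMaskedLoad(VecTy, Alignment))
    return TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment,
                                     AddrSpace, CostKind);
  return TTI.getMemoryOpCost(Instruction::Load, VecTy, Alignment, AddrSpace,
                             CostKind);
}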
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:239
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:221
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2978
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3050
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3002
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:483
iterator end()
Definition: VPlan.h:3012
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3010
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3063
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:212
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3041
bool empty() const
Definition: VPlan.h:3021
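
The VPBasicBlock iterator API above is how recipes are placed relative to the block's phi-like recipes. A minimal hedged sketch, assuming an already-constructed recipe R; the helper name is illustrative.

#include "VPlan.h"

using namespace llvm;

// Sketch only: put R right after the phi-like recipes of VPBB.
static void placeAfterPhis(VPBasicBlock *VPBB, VPRecipeBase *R) {
  if (VPBB->empty())
    VPBB->appendRecipe(R);                    // R becomes the only recipe
  else
    VPBB->insert(R, VPBB->getFirstNonPhi());  // keep phi-like recipes first
}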
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2033
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:437
VPRegionBlock * getParent()
Definition: VPlan.h:509
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:177
void setName(const Twine &newName)
Definition: VPlan.h:502
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:155
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:544
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:534
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3586
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
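
VPBuilder composes masks out of VPValues much like IRBuilder composes IR. Below is a hedged sketch of combining a block-in mask with an edge condition, using only the creation methods listed above; the helper name and the decision to negate are illustrative.

#include "VPlan.h"

using namespace llvm;

// Sketch only: combine a block-in mask with an (optionally negated)
// edge condition using the VPBuilder API above.
static VPValue *combineEdgeMask(VPBuilder &Builder, VPValue *SrcMask,
                                VPValue *Cond, bool NegateCond) {
  VPValue *EdgeCond = NegateCond ? Builder.createNot(Cond) : Cond;
  // Without a source mask, the edge condition is the mask itself.
  return SrcMask ? Builder.createLogicalAnd(SrcMask, EdgeCond) : EdgeCond;
}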
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2717
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:2746
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:396
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:2916
VPValue * getStartValue() const
Definition: VPlan.h:2915
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1719
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1763
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1752
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1229
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:1247
unsigned getOpcode() const
Definition: VPlan.h:1341
VPInterleaveRecipe is a recipe for transforming an interleave group of loads or stores into one wide l...
Definition: VPlan.h:2090
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:196
static VPLane getFirstLane()
Definition: VPlan.h:180
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:764
VPBasicBlock * getParent()
Definition: VPlan.h:789
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:860
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1149
A recipe for handling reduction phis.
Definition: VPlan.h:1974
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2028
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2020
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2181
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3156
const VPBlockBase * getEntry() const
Definition: VPlan.h:3195
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3227
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2296
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2336
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:891
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:955
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:39
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:202
operand_range operands()
Definition: VPlanValue.h:272
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:257
unsigned getNumOperands() const
Definition: VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:252
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:246
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1421
user_iterator user_begin()
Definition: VPlanValue.h:128
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:172
user_iterator user_end()
Definition: VPlanValue.h:130
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:167
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1425
user_range users()
Definition: VPlanValue.h:132
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1655
A recipe for widening Call instructions.
Definition: VPlan.h:1526
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2842
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1439
A recipe for handling GEP instructions.
Definition: VPlan.h:1613
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1776
A common base class for widening memory operations.
Definition: VPlan.h:2453
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition: VPlan.h:2500
Instruction & Ingredient
Definition: VPlan.h:2455
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2514
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2507
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition: VPlan.h:2504
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1902
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1941
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1938
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1406
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3260
void printDOT(raw_ostream &O) const
Print this VPlan in DOT format to O.
Definition: VPlan.cpp:1173
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:916
VPBasicBlock * getEntry()
Definition: VPlan.h:3362
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3387
void setName(const Twine &newName)
Definition: VPlan.h:3424
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3390
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3366
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3380
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:3407
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:1182
VPBasicBlock * getPreheader()
Definition: VPlan.h:3495
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3462
bool hasVF(ElementCount VF)
Definition: VPlan.h:3400
bool hasUF(unsigned UF) const
Definition: VPlan.h:3413
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1086
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3373
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which con...
Definition: VPlan.cpp:858
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3428
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:1179
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:976
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3470
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3481
void print(raw_ostream &O) const
Print this VPlan to O.
Definition: VPlan.cpp:1123
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3485
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1225
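
The VPlan query and printing entry points above are typically used when selecting among candidate plans. A hedged sketch follows, assuming the caller owns a list of VPlanPtr; the helper name and the chosen plan name are illustrative.

#include "VPlan.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Sketch only: return the first plan that supports VF, after printing it.
static VPlan *findPlanFor(ArrayRef<VPlanPtr> Plans, ElementCount VF) {
  for (const VPlanPtr &P : Plans)
    if (P->hasVF(VF)) {
      P->setName("Candidate VPlan");
      P->print(errs());   // printDOT() would emit a DOT graph instead
      return P.get();
    }
  return nullptr;
}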
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:82
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:78
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:671
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:664
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
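
ElementCount arithmetic relies on the FixedOrScalableQuantity helpers above rather than plain integer comparisons, because scalable quantities are only partially ordered. A small hedged sketch with an illustrative helper name:

#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Sketch only: halve a candidate VF until it is known not to exceed MaxVF.
static ElementCount clampVF(ElementCount VF, ElementCount MaxVF) {
  while (VF.isNonZero() && ElementCount::isKnownGT(VF, MaxVF))
    VF = VF.divideCoefficientBy(2);
  return VF;
}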
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:826
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
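
The PatternMatch helpers above are used to spot idioms such as an extend feeding a multiply, the shape priced by the extended and multiply-accumulate reduction costs. A hedged sketch with an illustrative function name:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Sketch only: does V compute mul(zext/sext(A), zext/sext(B))?
static bool isExtendedMul(Value *V, Value *&A, Value *&B) {
  return match(V, m_Mul(m_ZExtOrSExt(m_Value(A)), m_ZExtOrSExt(m_Value(B))));
}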
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3810
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1610
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
pred_iterator pred_end(BasicBlock *BB)
Definition: CFG.h:114
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1894
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:988
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2406
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7133
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:214
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
pred_iterator pred_begin(BasicBlock *BB)
Definition: CFG.h:110
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:55
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:147
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:135
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2242
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1701
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
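
createStepForVF and getRuntimeVF (listed in this index) abstract over fixed and scalable VFs when emitting IR. Below is a hedged sketch of a latch-style index bump; the helper name is illustrative and the declaration of createStepForVF is assumed to be visible via the vectorizer's headers.

#include "VPlan.h"   // assumed to declare createStepForVF
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Sketch only: advance a scalar index by VF * UF, handling scalable VFs.
static Value *advanceIndex(IRBuilderBase &B, Value *Index, ElementCount VF,
                           unsigned UF) {
  Value *Step = createStepForVF(B, Index->getType(), VF, UF);
  return B.CreateAdd(Index, Step, "index.next");
}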
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1952
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2045
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlan.h:95
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
LoopVectorizeResult runImpl(Function &F)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:86
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:100
ElementCount End
Definition: VPlan.h:105
Struct to hold various analysis needed for cost computations.
Definition: VPlan.h:737
LoopVectorizationCostModel & CM
Definition: VPlan.h:741
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlan.h:742
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1947
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:238
bool isFirstIteration() const
Definition: VPlan.h:250
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:384
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:392
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:255
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:254
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:429
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:432
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:369
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:425
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:361
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:401
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:307
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:267
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:409
VPlan * Plan
Pointer to the VPlan that code is generated for.
Definition: VPlan.h:415
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:412
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:261
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:380
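
VPTransformState carries the per-part values produced while a plan executes; recipes call get/set per unroll part. A minimal hedged sketch (illustrative helper) that records one generated IR value for every part:

#include "VPlan.h"

using namespace llvm;

// Sketch only: record the same generated vector value for all UF parts.
static void setForAllParts(VPTransformState &State, VPValue *Def, Value *V,
                           unsigned UF) {
  for (unsigned Part = 0; Part < UF; ++Part)
    State.set(Def, V, Part); // IsScalar defaults to false (a vector value)
}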
void execute(VPTransformState &State) override
Generate the wide load or gather.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2580
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2529
A recipe for widening select instructions.
Definition: VPlan.h:1579
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2655
void execute(VPTransformState &State) override
Generate the wide store or scatter.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2658
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2603
static bool tryAddExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of the LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.