LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
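// Illustrative sketch (added for exposition; the pseudocode below is not part
// of the original source): for a vectorization factor of 4, a scalar loop
// such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each wide iteration processes four
// elements and the induction variable advances by the vector width:
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i .. i+3] = b[i .. i+3] + c[i .. i+3];  // one SIMD add per iteration
//
// with any remaining iterations handled by a scalar epilogue loop.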
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanVerifier.h"
65#include "llvm/ADT/APInt.h"
66#include "llvm/ADT/ArrayRef.h"
67#include "llvm/ADT/DenseMap.h"
69#include "llvm/ADT/Hashing.h"
70#include "llvm/ADT/MapVector.h"
71#include "llvm/ADT/STLExtras.h"
73#include "llvm/ADT/SmallSet.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
82#include "llvm/Analysis/CFG.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfo.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/MDBuilder.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/Type.h"
124#include "llvm/IR/Use.h"
125#include "llvm/IR/User.h"
126#include "llvm/IR/Value.h"
127#include "llvm/IR/ValueHandle.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
133#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <map>
155#include <memory>
156#include <string>
157#include <tuple>
158#include <utility>
159
160using namespace llvm;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169/// @{
170/// Metadata attribute names
171const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
173 "llvm.loop.vectorize.followup_vectorized";
175 "llvm.loop.vectorize.followup_epilogue";
176/// @}
177
178STATISTIC(LoopsVectorized, "Number of loops vectorized");
179STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
180STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
181
183 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
184 cl::desc("Enable vectorization of epilogue loops."));
185
187 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
188 cl::desc("When epilogue vectorization is enabled, and a value greater than "
189 "1 is specified, forces the given VF for all applicable epilogue "
190 "loops."));
191
193 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
194 cl::desc("Only loops with vectorization factor equal to or larger than "
195 "the specified value are considered for epilogue vectorization."));
196
197/// Loops with a known constant trip count below this number are vectorized only
198/// if no scalar iteration overheads are incurred.
200 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
201 cl::desc("Loops with a constant trip count that is smaller than this "
202 "value are vectorized only if no scalar iteration overheads "
203 "are incurred."));
204
206 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
207 cl::desc("The maximum allowed number of runtime memory checks"));
208
209// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
210// that predication is preferred, and this lists all options. I.e., the
211// vectorizer will try to fold the tail-loop (epilogue) into the vector body
212// and predicate the instructions accordingly. If tail-folding fails, there are
213// different fallback strategies depending on these values:
215 enum Option {
219 };
220} // namespace PreferPredicateTy
221
223 "prefer-predicate-over-epilogue",
226 cl::desc("Tail-folding and predication preferences over creating a scalar "
227 "epilogue loop."),
229 "scalar-epilogue",
230 "Don't tail-predicate loops, create scalar epilogue"),
232 "predicate-else-scalar-epilogue",
233 "prefer tail-folding, create scalar epilogue if tail "
234 "folding fails."),
236 "predicate-dont-vectorize",
237 "prefers tail-folding, don't attempt vectorization if "
238 "tail-folding fails.")));
239
241 "force-tail-folding-style", cl::desc("Force the tail folding style"),
242 cl::init(TailFoldingStyle::None),
244 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
246 TailFoldingStyle::Data, "data",
247 "Create lane mask for data only, using active.lane.mask intrinsic"),
248 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
249 "data-without-lane-mask",
250 "Create lane mask with compare/stepvector"),
251 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
252 "Create lane mask using active.lane.mask intrinsic, and use "
253 "it for both data and control flow"),
254 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
255 "data-and-control-without-rt-check",
256 "Similar to data-and-control, but remove the runtime check"),
257 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
258 "Use predicated EVL instructions for tail folding. If EVL "
259 "is unsupported, fallback to data-without-lane-mask.")));
260
262 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
263 cl::desc("Maximize bandwidth when selecting vectorization factor which "
264 "will be determined by the smallest type in loop."));
265
267 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
268 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
269
270/// An interleave-group may need masking if it resides in a block that needs
271/// predication, or in order to mask away gaps.
273 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
274 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
275
277 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's number of scalar registers."));
279
281 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's number of vector registers."));
283
285 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
286 cl::desc("A flag that overrides the target's max interleave factor for "
287 "scalar loops."));
288
290 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
291 cl::desc("A flag that overrides the target's max interleave factor for "
292 "vectorized loops."));
293
295 "force-target-instruction-cost", cl::init(0), cl::Hidden,
296 cl::desc("A flag that overrides the target's expected cost for "
297 "an instruction to a single constant value. Mostly "
298 "useful for getting consistent testing."));
299
301 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
302 cl::desc(
303 "Pretend that scalable vectors are supported, even if the target does "
304 "not support them. This flag should only be used for testing."));
305
307 "small-loop-cost", cl::init(20), cl::Hidden,
308 cl::desc(
309 "The cost of a loop that is considered 'small' by the interleaver."));
310
312 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
313 cl::desc("Enable the use of the block frequency analysis to access PGO "
314 "heuristics minimizing code growth in cold regions and being more "
315 "aggressive in hot regions."));
316
317// Runtime interleave loops for load/store throughput.
319 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
320 cl::desc(
321 "Enable runtime interleaving until load/store ports are saturated"));
322
323/// The number of stores in a loop that are allowed to need predication.
325 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
326 cl::desc("Max number of stores to be predicated behind an if."));
327
329 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
330 cl::desc("Count the induction variable only once when interleaving"));
331
333 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
334 cl::desc("Enable if predication of stores during vectorization."));
335
337 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
338 cl::desc("The maximum interleave count to use when interleaving a scalar "
339 "reduction in a nested loop."));
340
341static cl::opt<bool>
342 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
344 cl::desc("Prefer in-loop vector reductions, "
345 "overriding the targets preference."));
346
348 "force-ordered-reductions", cl::init(false), cl::Hidden,
349 cl::desc("Enable the vectorisation of loops with in-order (strict) "
350 "FP reductions"));
351
353 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
354 cl::desc(
355 "Prefer predicating a reduction operation over an after loop select."));
356
357namespace llvm {
359 "enable-vplan-native-path", cl::Hidden,
360 cl::desc("Enable VPlan-native vectorization path with "
361 "support for outer loop vectorization."));
362}
363
364// This flag enables the stress testing of the VPlan H-CFG construction in the
365// VPlan-native vectorization path. It must be used in conjunction with
366// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
367// verification of the H-CFGs built.
369 "vplan-build-stress-test", cl::init(false), cl::Hidden,
370 cl::desc(
371 "Build VPlan for every supported loop nest in the function and bail "
372 "out right after the build (stress test the VPlan H-CFG construction "
373 "in the VPlan-native vectorization path)."));
374
376 "interleave-loops", cl::init(true), cl::Hidden,
377 cl::desc("Enable loop interleaving in Loop vectorization passes"));
379 "vectorize-loops", cl::init(true), cl::Hidden,
380 cl::desc("Run the Loop vectorization passes"));
381
383 "vplan-print-in-dot-format", cl::Hidden,
384 cl::desc("Use dot format instead of plain text when dumping VPlans"));
385
387 "force-widen-divrem-via-safe-divisor", cl::Hidden,
388 cl::desc(
389 "Override cost based safe divisor widening for div/rem instructions"));
390
392 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
394 cl::desc("Try wider VFs if they enable the use of vector variants"));
395
396// Likelihood of bypassing the vectorized loop because assumptions about SCEV
397// variables not overflowing do not hold. See `emitSCEVChecks`.
398static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
399// Likelihood of bypassing the vectorized loop because pointers overlap. See
400// `emitMemRuntimeChecks`.
401static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
402// Likelihood of bypassing the vectorized loop because there are zero trips left
403// after prolog. See `emitIterationCountCheck`.
404static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
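// Note (added for clarity): a {1, 127} pair gives the unlikely edge a relative
// weight of 1 against 127, i.e. roughly a 1-in-128 chance of bypassing the
// vectorized loop.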
405
406/// A helper function that returns true if the given type is irregular. The
407/// type is irregular if its allocated size doesn't equal the store size of an
408/// element of the corresponding vector type.
409static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
410 // Determine if an array of N elements of type Ty is "bitcast compatible"
411 // with a <N x Ty> vector.
412 // This is only true if there is no padding between the array elements.
413 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
414}
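// For example (illustrative): x86_fp80 typically has a type size of 80 bits
// but an alloc size of 96 or 128 bits, so arrays of x86_fp80 contain padding
// and the type is irregular; i32 has matching type and alloc sizes and is
// regular.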
415
416/// Returns "best known" trip count for the specified loop \p L as defined by
417/// the following procedure:
418/// 1) Returns exact trip count if it is known.
419/// 2) Returns expected trip count according to profile data if any.
420/// 3) Returns upper bound estimate if it is known.
421/// 4) Returns std::nullopt if all of the above failed.
422static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
423 Loop *L) {
424 // Check if exact trip count is known.
425 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
426 return ExpectedTC;
427
428 // Check if there is an expected trip count available from profile data.
430 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
431 return *EstimatedTC;
432
433 // Check if upper bound estimate is known.
434 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
435 return ExpectedTC;
436
437 return std::nullopt;
438}
439
440namespace {
441// Forward declare GeneratedRTChecks.
442class GeneratedRTChecks;
443
444using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
445} // namespace
446
447namespace llvm {
448
450
451/// InnerLoopVectorizer vectorizes loops which contain only one basic
452/// block to a specified vectorization factor (VF).
453/// This class performs the widening of scalars into vectors, or multiple
454/// scalars. This class also implements the following features:
455/// * It inserts an epilogue loop for handling loops that don't have iteration
456/// counts that are known to be a multiple of the vectorization factor.
457/// * It handles the code generation for reduction variables.
458/// * Scalarization (implementation using scalars) of un-vectorizable
459/// instructions.
460/// InnerLoopVectorizer does not perform any vectorization-legality
461/// checks, and relies on the caller to check for the different legality
462/// aspects. The InnerLoopVectorizer relies on the
463/// LoopVectorizationLegality class to provide information about the induction
464/// and reduction variables that were found to a given vectorization factor.
466public:
469 const TargetLibraryInfo *TLI,
473 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
475 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
476 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
477 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
478 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
480 // Query this against the original loop and save it here because the profile
481 // of the original loop header may change as the transformation happens.
484
486 this->MinProfitableTripCount = VecWidth;
487 else
488 this->MinProfitableTripCount = MinProfitableTripCount;
489 }
490
491 virtual ~InnerLoopVectorizer() = default;
492
493 /// Create a new empty loop that will contain vectorized instructions later
494 /// on, while the old loop will be used as the scalar remainder. Control flow
495 /// is generated around the vectorized (and scalar epilogue) loops consisting
496 /// of various checks and bypasses. Return the pre-header block of the new
497 /// loop and the start value for the canonical induction, if it is != 0. The
498 /// latter is the case when vectorizing the epilogue loop. In the case of
499/// epilogue vectorization, this function is overridden to handle the more
500 /// complex control flow around the loops. \p ExpandedSCEVs is used to
501 /// look up SCEV expansions for expressions needed during skeleton creation.
502 virtual std::pair<BasicBlock *, Value *>
503 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
504
505 /// Fix the vectorized code, taking care of header phis, live-outs, and more.
506 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
507
508 // Return true if any runtime check is added.
510
511 /// A helper function to scalarize a single Instruction in the innermost loop.
512 /// Generates a sequence of scalar instances for each lane between \p MinLane
513 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
514 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
515 /// Instr's operands.
516 void scalarizeInstruction(const Instruction *Instr,
517 VPReplicateRecipe *RepRecipe,
518 const VPIteration &Instance,
519 VPTransformState &State);
520
521 /// Fix the non-induction PHIs in \p Plan.
522 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
523
524 /// Create a new phi node for the induction variable \p OrigPhi to resume
525 /// iteration count in the scalar epilogue, from where the vectorized loop
526 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
527 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
528 /// and the resume values can come from an additional bypass block, the \p
529 /// AdditionalBypass pair provides information about the bypass block and the
530 /// end value on the edge from bypass to this loop.
532 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
533 ArrayRef<BasicBlock *> BypassBlocks,
534 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
535
536 /// Returns the original loop trip count.
537 Value *getTripCount() const { return TripCount; }
538
539 /// Used to set the trip count after ILV's construction and after the
540 /// preheader block has been executed. Note that this always holds the trip
541 /// count of the original loop for both main loop and epilogue vectorization.
542 void setTripCount(Value *TC) { TripCount = TC; }
543
544protected:
546
547 /// A small list of PHINodes.
549
550 /// A type for scalarized values in the new loop. Each value from the
551 /// original loop, when scalarized, is represented by UF x VF scalar values
552 /// in the new unrolled loop, where UF is the unroll factor and VF is the
553 /// vectorization factor.
555
556 /// Set up the values of the IVs correctly when exiting the vector loop.
557 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
558 Value *VectorTripCount, Value *EndValue,
559 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
560 VPlan &Plan, VPTransformState &State);
561
562 /// Iteratively sink the scalarized operands of a predicated instruction into
563 /// the block that was created for it.
564 void sinkScalarOperands(Instruction *PredInst);
565
566 /// Returns (and creates if needed) the trip count of the widened loop.
568
569 /// Emit a bypass check to see if the vector trip count is zero, including if
570 /// it overflows.
572
573 /// Emit a bypass check to see if all of the SCEV assumptions we've
574 /// had to make are correct. Returns the block containing the checks or
575 /// nullptr if no checks have been added.
577
578 /// Emit bypass checks to check any memory assumptions we may have made.
579 /// Returns the block containing the checks or nullptr if no checks have been
580 /// added.
582
583 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
584 /// vector loop preheader, middle block and scalar preheader.
586
587 /// Create new phi nodes for the induction variables to resume iteration count
588 /// in the scalar epilogue, from where the vectorized loop left off.
589 /// In cases where the loop skeleton is more complicated (eg. epilogue
590 /// vectorization) and the resume values can come from an additional bypass
591 /// block, the \p AdditionalBypass pair provides information about the bypass
592 /// block and the end value on the edge from bypass to this loop.
594 const SCEV2ValueTy &ExpandedSCEVs,
595 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
596
597 /// Complete the loop skeleton by adding debug MDs, creating appropriate
598 /// conditional branches in the middle block, preparing the builder and
599 /// running the verifier. Return the preheader of the completed vector loop.
601
602 /// Allow subclasses to override and print debug traces before/after vplan
603 /// execution, when trace information is requested.
604 virtual void printDebugTracesAtStart(){};
605 virtual void printDebugTracesAtEnd(){};
606
607 /// The original loop.
609
610 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
611 /// dynamic knowledge to simplify SCEV expressions and converts them to a
612 /// more usable form.
614
615 /// Loop Info.
617
618 /// Dominator Tree.
620
621 /// Target Library Info.
623
624 /// Target Transform Info.
626
627 /// Assumption Cache.
629
630 /// Interface to emit optimization remarks.
632
633 /// The vectorization SIMD factor to use. Each vector will have this many
634 /// vector elements.
636
638
639 /// The vectorization unroll factor to use. Each scalar is vectorized to this
640 /// many different vector instructions.
641 unsigned UF;
642
643 /// The builder that we use
645
646 // --- Vectorization state ---
647
648 /// The vector-loop preheader.
650
651 /// The scalar-loop preheader.
653
654 /// Middle Block between the vector and the scalar.
656
657 /// The unique ExitBlock of the scalar loop if one exists. Note that
658 /// there can be multiple exiting edges reaching this block.
660
661 /// The scalar loop body.
663
664 /// A list of all bypass blocks. The first block is the entry of the loop.
666
667 /// Store instructions that were predicated.
669
670 /// Trip count of the original loop.
671 Value *TripCount = nullptr;
672
673 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
675
676 /// The legality analysis.
678
679 /// The profitability analysis.
681
682 // Record whether runtime checks are added.
683 bool AddedSafetyChecks = false;
684
685 // Holds the end values for each induction variable. We save the end values
686 // so we can later fix-up the external users of the induction variables.
688
689 /// BFI and PSI are used to check for profile guided size optimizations.
692
693 // Whether this loop should be optimized for size based on profile guided size
694 // optimizations.
696
697 /// Structure to hold information about generated runtime checks, responsible
698 /// for cleaning the checks, if vectorization turns out unprofitable.
699 GeneratedRTChecks &RTChecks;
700
701 // Holds the resume values for reductions in the loops, used to set the
702 // correct start value of reduction PHIs when vectorizing the epilogue.
705};
706
708public:
711 const TargetLibraryInfo *TLI,
713 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
716 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
718 ElementCount::getFixed(1),
719 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
720 BFI, PSI, Check) {}
721};
722
723/// Encapsulate information regarding vectorization of a loop and its epilogue.
724/// This information is meant to be updated and used across two stages of
725/// epilogue vectorization.
728 unsigned MainLoopUF = 0;
730 unsigned EpilogueUF = 0;
735 Value *TripCount = nullptr;
737
739 ElementCount EVF, unsigned EUF)
740 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
741 assert(EUF == 1 &&
742 "A high UF for the epilogue loop is likely not beneficial.");
743 }
744};
745
746/// An extension of the inner loop vectorizer that creates a skeleton for a
747/// vectorized loop that has its epilogue (residual) also vectorized.
748/// The idea is to run the vplan on a given loop twice, first to set up the
749/// skeleton and vectorize the main loop, and second to complete the skeleton
750/// from the first step and vectorize the epilogue. This is achieved by
751/// deriving two concrete strategy classes from this base class and invoking
752/// them in succession from the loop vectorizer planner.
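///
/// Roughly, the combined skeleton looks like this (an illustrative sketch,
/// not the exact CFG that is produced):
///
///   [main iter check] -> [main vector loop] -> [epilogue iter check]
///       -> [vector epilogue loop] -> [scalar epilogue] -> [exit]
///
/// with bypass edges from each check to the later loops for the case that too
/// few iterations remain.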
754public:
762 GeneratedRTChecks &Checks)
764 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
765 CM, BFI, PSI, Checks),
766 EPI(EPI) {}
767
768 // Override this function to handle the more complex control flow around the
769 // three loops.
770 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
771 const SCEV2ValueTy &ExpandedSCEVs) final {
772 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
773 }
774
775 /// The interface for creating a vectorized skeleton using one of two
776 /// different strategies, each corresponding to one execution of the vplan
777 /// as described above.
778 virtual std::pair<BasicBlock *, Value *>
779 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
780
781 /// Holds and updates state information required to vectorize the main loop
782 /// and its epilogue in two separate passes. This setup helps us avoid
783 /// regenerating and recomputing runtime safety checks. It also helps us to
784 /// shorten the iteration-count-check path length for the cases where the
785 /// iteration count of the loop is so small that the main vector loop is
786 /// completely skipped.
788};
789
790/// A specialized derived class of inner loop vectorizer that performs
791/// vectorization of *main* loops in the process of vectorizing loops and their
792/// epilogues.
794public:
802 GeneratedRTChecks &Check)
804 EPI, LVL, CM, BFI, PSI, Check) {}
805 /// Implements the interface for creating a vectorized skeleton using the
806 /// *main loop* strategy (ie the first pass of vplan execution).
807 std::pair<BasicBlock *, Value *>
808 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
809
810protected:
811 /// Emits an iteration count bypass check once for the main loop (when \p
812 /// ForEpilogue is false) and once for the epilogue loop (when \p
813 /// ForEpilogue is true).
814 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
815 void printDebugTracesAtStart() override;
816 void printDebugTracesAtEnd() override;
817};
818
819// A specialized derived class of inner loop vectorizer that performs
820// vectorization of *epilogue* loops in the process of vectorizing loops and
821// their epilogues.
823public:
831 GeneratedRTChecks &Checks)
833 EPI, LVL, CM, BFI, PSI, Checks) {
835 }
836 /// Implements the interface for creating a vectorized skeleton using the
837 /// *epilogue loop* strategy (ie the second pass of vplan execution).
838 std::pair<BasicBlock *, Value *>
839 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
840
841protected:
842 /// Emits an iteration count bypass check after the main vector loop has
843 /// finished to see if there are any iterations left to execute by either
844 /// the vector epilogue or the scalar epilogue.
846 BasicBlock *Bypass,
847 BasicBlock *Insert);
848 void printDebugTracesAtStart() override;
849 void printDebugTracesAtEnd() override;
850};
851} // end namespace llvm
852
853/// Look for a meaningful debug location on the instruction or its
854/// operands.
856 if (!I)
857 return DebugLoc();
858
860 if (I->getDebugLoc() != Empty)
861 return I->getDebugLoc();
862
863 for (Use &Op : I->operands()) {
864 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
865 if (OpInst->getDebugLoc() != Empty)
866 return OpInst->getDebugLoc();
867 }
868
869 return I->getDebugLoc();
870}
871
872/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
873/// is passed, the message relates to that particular instruction.
874#ifndef NDEBUG
875static void debugVectorizationMessage(const StringRef Prefix,
876 const StringRef DebugMsg,
877 Instruction *I) {
878 dbgs() << "LV: " << Prefix << DebugMsg;
879 if (I != nullptr)
880 dbgs() << " " << *I;
881 else
882 dbgs() << '.';
883 dbgs() << '\n';
884}
885#endif
886
887/// Create an analysis remark that explains why vectorization failed
888///
889/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
890/// RemarkName is the identifier for the remark. If \p I is passed it is an
891/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
892/// the location of the remark. \return the remark object that can be
893/// streamed to.
895 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
896 Value *CodeRegion = TheLoop->getHeader();
897 DebugLoc DL = TheLoop->getStartLoc();
898
899 if (I) {
900 CodeRegion = I->getParent();
901 // If there is no debug location attached to the instruction, fall back to
902 // using the loop's.
903 if (I->getDebugLoc())
904 DL = I->getDebugLoc();
905 }
906
907 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
908}
909
910namespace llvm {
911
912/// Return a value for Step multiplied by VF.
914 int64_t Step) {
915 assert(Ty->isIntegerTy() && "Expected an integer step");
916 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
917}
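// Example (illustrative): for a fixed VF of 4 and Step = 2 this folds to the
// constant 8; for a scalable VF of <vscale x 4> it emits the equivalent of
// 'mul(vscale, 8)' to be evaluated at runtime.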
918
919/// Return the runtime value for VF.
921 return B.CreateElementCount(Ty, VF);
922}
923
925 Loop *OrigLoop) {
926 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
927 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
928
929 ScalarEvolution &SE = *PSE.getSE();
930 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
931}
932
934 const StringRef OREMsg, const StringRef ORETag,
935 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
936 Instruction *I) {
937 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
938 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
939 ORE->emit(
940 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
941 << "loop not vectorized: " << OREMsg);
942}
943
944/// Reports an informative message: print \p Msg for debugging purposes as well
945/// as an optimization remark. Uses either \p I as location of the remark, or
946/// otherwise \p TheLoop.
947static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
948 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
949 Instruction *I = nullptr) {
951 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
952 ORE->emit(
953 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
954 << Msg);
955}
956
957/// Report successful vectorization of the loop. In case an outer loop is
958/// vectorized, prepend "outer" to the vectorization remark.
960 VectorizationFactor VF, unsigned IC) {
962 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
963 nullptr));
964 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
965 ORE->emit([&]() {
966 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
967 TheLoop->getHeader())
968 << "vectorized " << LoopType << "loop (vectorization width: "
969 << ore::NV("VectorizationFactor", VF.Width)
970 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
971 });
972}
973
974} // end namespace llvm
975
976namespace llvm {
977
978// Loop vectorization cost-model hints how the scalar epilogue loop should be
979// lowered.
981
982 // The default: allowing scalar epilogues.
984
985 // Vectorization with OptForSize: don't allow epilogues.
987
988 // A special case of vectorization with OptForSize: loops with a very small
989 // trip count are considered for vectorization under OptForSize, thereby
990 // making sure the cost of their loop body is dominant, free of runtime
991 // guards and scalar iteration overheads.
993
994 // Loop hint predicate indicating an epilogue is undesired.
996
997 // Directive indicating we must either tail fold or not vectorize
1000
1001using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1002
1003/// LoopVectorizationCostModel - estimates the expected speedups due to
1004/// vectorization.
1005/// In many cases vectorization is not profitable. This can happen because of
1006/// a number of reasons. In this class we mainly attempt to predict the
1007/// expected speedup/slowdowns due to the supported instruction set. We use the
1008/// TargetTransformInfo to query the different backends for the cost of
1009/// different operations.
1011public:
1015 const TargetTransformInfo &TTI,
1021 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1022 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1023 Hints(Hints), InterleaveInfo(IAI) {}
1024
1025 /// \return An upper bound for the vectorization factors (both fixed and
1026 /// scalable). If the factors are 0, vectorization and interleaving should be
1027 /// avoided up front.
1028 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1029
1030 /// \return True if runtime checks are required for vectorization, and false
1031 /// otherwise.
1032 bool runtimeChecksRequired();
1033
1034 /// Setup cost-based decisions for user vectorization factor.
1035 /// \return true if the UserVF is a feasible VF to be chosen.
1039 return expectedCost(UserVF).isValid();
1040 }
1041
1042 /// \return The size (in bits) of the smallest and widest types in the code
1043 /// that needs to be vectorized. We ignore values that remain scalar such as
1044 /// 64 bit loop indices.
1045 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1046
1047 /// \return The desired interleave count.
1048 /// If interleave count has been specified by metadata it will be returned.
1049 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1050 /// are the selected vectorization factor and the cost of the selected VF.
1051 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1052
1053 /// Memory access instruction may be vectorized in more than one way.
1054 /// Form of instruction after vectorization depends on cost.
1055 /// This function takes cost-based decisions for Load/Store instructions
1056 /// and collects them in a map. This decisions map is used for building
1057 /// the lists of loop-uniform and loop-scalar instructions.
1058 /// The calculated cost is saved with widening decision in order to
1059 /// avoid redundant calculations.
1061
1062 /// A call may be vectorized in different ways depending on whether we have
1063 /// vectorized variants available and whether the target supports masking.
1064 /// This function analyzes all calls in the function at the supplied VF,
1065 /// makes a decision based on the costs of available options, and stores that
1066 /// decision in a map for use in planning and plan execution.
1068
1069 /// A struct that represents some properties of the register usage
1070 /// of a loop.
1072 /// Holds the number of loop invariant values that are used in the loop.
1073 /// The key is ClassID of target-provided register class.
1075 /// Holds the maximum number of concurrent live intervals in the loop.
1076 /// The key is ClassID of target-provided register class.
1078 };
1079
1080 /// \return Returns information about the register usages of the loop for the
1081 /// given vectorization factors.
1084
1085 /// Collect values we want to ignore in the cost model.
1086 void collectValuesToIgnore();
1087
1088 /// Collect all element types in the loop for which widening is needed.
1090
1091 /// Split reductions into those that happen in the loop, and those that happen
1092 /// outside. In-loop reductions are collected into InLoopReductions.
1094
1095 /// Returns true if we should use strict in-order reductions for the given
1096 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1097 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1098 /// of FP operations.
1099 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1100 return !Hints->allowReordering() && RdxDesc.isOrdered();
1101 }
1102
1103 /// \returns The smallest bitwidth each instruction can be represented with.
1104 /// The vector equivalents of these instructions should be truncated to this
1105 /// type.
1107 return MinBWs;
1108 }
1109
1110 /// \returns True if it is more profitable to scalarize instruction \p I for
1111 /// vectorization factor \p VF.
1113 assert(VF.isVector() &&
1114 "Profitable to scalarize relevant only for VF > 1.");
1115 assert(
1116 TheLoop->isInnermost() &&
1117 "cost-model should not be used for outer loops (in VPlan-native path)");
1118
1119 auto Scalars = InstsToScalarize.find(VF);
1120 assert(Scalars != InstsToScalarize.end() &&
1121 "VF not yet analyzed for scalarization profitability");
1122 return Scalars->second.contains(I);
1123 }
1124
1125 /// Returns true if \p I is known to be uniform after vectorization.
1127 assert(
1128 TheLoop->isInnermost() &&
1129 "cost-model should not be used for outer loops (in VPlan-native path)");
1130 // Pseudo probe needs to be duplicated for each unrolled iteration and
1131 // vector lane so that profiled loop trip count can be accurately
1132 // accumulated instead of being under counted.
1133 if (isa<PseudoProbeInst>(I))
1134 return false;
1135
1136 if (VF.isScalar())
1137 return true;
1138
1139 auto UniformsPerVF = Uniforms.find(VF);
1140 assert(UniformsPerVF != Uniforms.end() &&
1141 "VF not yet analyzed for uniformity");
1142 return UniformsPerVF->second.count(I);
1143 }
1144
1145 /// Returns true if \p I is known to be scalar after vectorization.
1147 assert(
1148 TheLoop->isInnermost() &&
1149 "cost-model should not be used for outer loops (in VPlan-native path)");
1150 if (VF.isScalar())
1151 return true;
1152
1153 auto ScalarsPerVF = Scalars.find(VF);
1154 assert(ScalarsPerVF != Scalars.end() &&
1155 "Scalar values are not calculated for VF");
1156 return ScalarsPerVF->second.count(I);
1157 }
1158
1159 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1160 /// for vectorization factor \p VF.
1162 return VF.isVector() && MinBWs.contains(I) &&
1163 !isProfitableToScalarize(I, VF) &&
1165 }
1166
1167 /// Decision that was taken during cost calculation for memory instruction.
1170 CM_Widen, // For consecutive accesses with stride +1.
1171 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1178
1179 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1180 /// instruction \p I and vector width \p VF.
1183 assert(VF.isVector() && "Expected VF >=2");
1184 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1185 }
1186
1187 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1188 /// interleaving group \p Grp and vector width \p VF.
1192 assert(VF.isVector() && "Expected VF >=2");
1193 /// Broadcast this decision to all instructions inside the group.
1194 /// But the cost will be assigned to one instruction only.
1195 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1196 if (auto *I = Grp->getMember(i)) {
1197 if (Grp->getInsertPos() == I)
1198 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1199 else
1200 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1201 }
1202 }
1203 }
1204
1205 /// Return the cost model decision for the given instruction \p I and vector
1206 /// width \p VF. Return CM_Unknown if this instruction did not pass
1207 /// through the cost modeling.
1209 assert(VF.isVector() && "Expected VF to be a vector VF");
1210 assert(
1211 TheLoop->isInnermost() &&
1212 "cost-model should not be used for outer loops (in VPlan-native path)");
1213
1214 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1215 auto Itr = WideningDecisions.find(InstOnVF);
1216 if (Itr == WideningDecisions.end())
1217 return CM_Unknown;
1218 return Itr->second.first;
1219 }
1220
1221 /// Return the vectorization cost for the given instruction \p I and vector
1222 /// width \p VF.
1224 assert(VF.isVector() && "Expected VF >=2");
1225 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1226 assert(WideningDecisions.contains(InstOnVF) &&
1227 "The cost is not calculated");
1228 return WideningDecisions[InstOnVF].second;
1229 }
1230
1235 std::optional<unsigned> MaskPos;
1237 };
1238
1240 Function *Variant, Intrinsic::ID IID,
1241 std::optional<unsigned> MaskPos,
1243 assert(!VF.isScalar() && "Expected vector VF");
1244 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1245 MaskPos, Cost};
1246 }
1247
1249 ElementCount VF) const {
1250 assert(!VF.isScalar() && "Expected vector VF");
1251 return CallWideningDecisions.at(std::make_pair(CI, VF));
1252 }
1253
1254 /// Return True if instruction \p I is an optimizable truncate whose operand
1255 /// is an induction variable. Such a truncate will be removed by adding a new
1256 /// induction variable with the destination type.
1258 // If the instruction is not a truncate, return false.
1259 auto *Trunc = dyn_cast<TruncInst>(I);
1260 if (!Trunc)
1261 return false;
1262
1263 // Get the source and destination types of the truncate.
1264 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1265 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1266
1267 // If the truncate is free for the given types, return false. Replacing a
1268 // free truncate with an induction variable would add an induction variable
1269 // update instruction to each iteration of the loop. We exclude from this
1270 // check the primary induction variable since it will need an update
1271 // instruction regardless.
1272 Value *Op = Trunc->getOperand(0);
1273 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1274 return false;
1275
1276 // If the truncated value is not an induction variable, return false.
1277 return Legal->isInductionPhi(Op);
1278 }
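// Example (illustrative): with a primary i64 induction %iv, a non-free
// 'trunc i64 %iv to i32' feeding address or arithmetic computations can be
// removed by introducing an additional i32 induction variable with truncated
// start and step values.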
1279
1280 /// Collects the instructions to scalarize for each predicated instruction in
1281 /// the loop.
1283
1284 /// Collect Uniform and Scalar values for the given \p VF.
1285 /// The sets depend on CM decision for Load/Store instructions
1286 /// that may be vectorized as interleave, gather-scatter or scalarized.
1287 /// Also make a decision on what to do about call instructions in the loop
1288 /// at that VF -- scalarize, call a known vector routine, or call a
1289 /// vector intrinsic.
1291 // Do the analysis once.
1292 if (VF.isScalar() || Uniforms.contains(VF))
1293 return;
1296 collectLoopUniforms(VF);
1297 collectLoopScalars(VF);
1298 }
1299
1300 /// Returns true if the target machine supports masked store operation
1301 /// for the given \p DataType and kind of access to \p Ptr.
1302 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1303 return Legal->isConsecutivePtr(DataType, Ptr) &&
1304 TTI.isLegalMaskedStore(DataType, Alignment);
1305 }
1306
1307 /// Returns true if the target machine supports masked load operation
1308 /// for the given \p DataType and kind of access to \p Ptr.
1309 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1310 return Legal->isConsecutivePtr(DataType, Ptr) &&
1311 TTI.isLegalMaskedLoad(DataType, Alignment);
1312 }
1313
1314 /// Returns true if the target machine can represent \p V as a masked gather
1315 /// or scatter operation.
1317 bool LI = isa<LoadInst>(V);
1318 bool SI = isa<StoreInst>(V);
1319 if (!LI && !SI)
1320 return false;
1321 auto *Ty = getLoadStoreType(V);
1323 if (VF.isVector())
1324 Ty = VectorType::get(Ty, VF);
1325 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1326 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1327 }
1328
1329 /// Returns true if the target machine supports all of the reduction
1330 /// variables found for the given VF.
1332 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1333 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1334 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1335 }));
1336 }
1337
1338 /// Given costs for both strategies, return true if the scalar predication
1339 /// lowering should be used for div/rem. This incorporates an override
1340 /// option so it is not simply a cost comparison.
1342 InstructionCost SafeDivisorCost) const {
1343 switch (ForceSafeDivisor) {
1344 case cl::BOU_UNSET:
1345 return ScalarCost < SafeDivisorCost;
1346 case cl::BOU_TRUE:
1347 return false;
1348 case cl::BOU_FALSE:
1349 return true;
1350 };
1351 llvm_unreachable("impossible case value");
1352 }
1353
1354 /// Returns true if \p I is an instruction which requires predication and
1355 /// for which our chosen predication strategy is scalarization (i.e. we
1356 /// don't have an alternate strategy such as masking available).
1357 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1359
1360 /// Returns true if \p I is an instruction that needs to be predicated
1361 /// at runtime. The result is independent of the predication mechanism.
1362 /// Superset of instructions that return true for isScalarWithPredication.
1363 bool isPredicatedInst(Instruction *I) const;
1364
1365 /// Return the costs for our two available strategies for lowering a
1366 /// div/rem operation which requires speculating at least one lane.
1367 /// First result is for scalarization (will be invalid for scalable
1368 /// vectors); second is for the safe-divisor strategy.
1369 std::pair<InstructionCost, InstructionCost>
1371 ElementCount VF) const;
1372
1373 /// Returns true if \p I is a memory instruction with consecutive memory
1374 /// access that can be widened.
1376
1377 /// Returns true if \p I is a memory instruction in an interleaved-group
1378 /// of memory accesses that can be vectorized with wide vector loads/stores
1379 /// and shuffles.
1381
1382 /// Check if \p Instr belongs to any interleaved access group.
1384 return InterleaveInfo.isInterleaved(Instr);
1385 }
1386
1387 /// Get the interleaved access group that \p Instr belongs to.
1390 return InterleaveInfo.getInterleaveGroup(Instr);
1391 }
1392
1393 /// Returns true if we're required to use a scalar epilogue for at least
1394 /// the final iteration of the original loop.
1395 bool requiresScalarEpilogue(bool IsVectorizing) const {
1396 if (!isScalarEpilogueAllowed()) {
1397 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1398 return false;
1399 }
1400 // If we might exit from anywhere but the latch, must run the exiting
1401 // iteration in scalar form.
1403 LLVM_DEBUG(
1404 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1405 return true;
1406 }
1407 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1408 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1409 "interleaved group requires scalar epilogue\n");
1410 return true;
1411 }
1412 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1413 return false;
1414 }
1415
1416 /// Returns true if we're required to use a scalar epilogue for at least
1417 /// the final iteration of the original loop for all VFs in \p Range.
1418 /// A scalar epilogue must either be required for all VFs in \p Range or for
1419 /// none.
1421 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1422 return requiresScalarEpilogue(VF.isVector());
1423 };
1424 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1425 assert(
1426 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1427 "all VFs in range must agree on whether a scalar epilogue is required");
1428 return IsRequired;
1429 }
1430
1431 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1432 /// loop hint annotation.
1434 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1435 }
1436
1437 /// Returns the TailFoldingStyle that is best for the current loop.
1438 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1439 if (!ChosenTailFoldingStyle)
1441 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1442 : ChosenTailFoldingStyle->second;
1443 }
1444
1445 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1446 /// overflow or not.
1447 /// \param IsScalableVF true if scalable vector factors enabled.
1448 /// \param UserIC User specific interleave count.
1449 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1450 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1451 if (!Legal->canFoldTailByMasking()) {
1452 ChosenTailFoldingStyle =
1454 return;
1455 }
1456
1457 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1458 ChosenTailFoldingStyle = std::make_pair(
1459 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1460 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1461 return;
1462 }
1463
1464 // Set styles when forced.
1465 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1466 ForceTailFoldingStyle.getValue());
1468 return;
1469 // Override forced styles if needed.
1470 // FIXME: use actual opcode/data type for analysis here.
1471 // FIXME: Investigate opportunity for fixed vector factor.
1472 bool EVLIsLegal =
1473 IsScalableVF && UserIC <= 1 &&
1474 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1476 // FIXME: implement support for max safe dependency distance.
1478 if (!EVLIsLegal) {
1479 // If for some reason EVL mode is unsupported, fall back to
1480 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1481 // in a generic way.
1482 ChosenTailFoldingStyle =
1485 LLVM_DEBUG(
1486 dbgs()
1487 << "LV: Preference for VP intrinsics indicated. Will "
1488 "not try to generate VP Intrinsics "
1489 << (UserIC > 1
1490 ? "since interleave count specified is greater than 1.\n"
1491 : "due to non-interleaving reasons.\n"));
1492 }
1493 }
1494
1495 /// Returns true if all loop blocks should be masked to fold tail loop.
1496 bool foldTailByMasking() const {
1497 // TODO: check if it is possible to check for None style independent of
1498 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1500 }
1501
1502 /// Returns true if the instructions in this block require predication
1503 /// for any reason, e.g. because tail folding now requires a predicate
1504 /// or because the block in the original loop was predicated.
1507 }
1508
1509 /// Returns true if VP intrinsics with explicit vector length support should
1510 /// be generated in the tail folded loop.
1511 bool foldTailWithEVL() const {
1513 }
1514
1515 /// Returns true if the Phi is part of an inloop reduction.
1516 bool isInLoopReduction(PHINode *Phi) const {
1517 return InLoopReductions.contains(Phi);
1518 }
1519
1520 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1521 /// with factor VF. Return the cost of the instruction, including
1522 /// scalarization overhead if it's needed.
1524
1525 /// Estimate cost of a call instruction CI if it were vectorized with factor
1526 /// VF. Return the cost of the instruction, including scalarization overhead
1527 /// if it's needed.
1529
1530 /// Invalidates decisions already taken by the cost model.
1532 WideningDecisions.clear();
1533 CallWideningDecisions.clear();
1534 Uniforms.clear();
1535 Scalars.clear();
1536 }
1537
1538 /// Returns the expected execution cost. The unit of the cost does
1539 /// not matter because we use the 'cost' units to compare different
1540 /// vector widths. The cost that is returned is *not* normalized by
1541 /// the factor width. If \p Invalid is not nullptr, this function
1542 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1543 /// each instruction that has an Invalid cost for the given VF.
1547
1548 bool hasPredStores() const { return NumPredStores > 0; }
1549
1550 /// Returns true if epilogue vectorization is considered profitable, and
1551 /// false otherwise.
1552 /// \p VF is the vectorization factor chosen for the original loop.
1554
1555 /// Returns the execution time cost of an instruction for a given vector
1556 /// width. Vector width of one means scalar.
1558
1559 /// Return the cost of instructions in an inloop reduction pattern, if I is
1560 /// part of that pattern.
1561 std::optional<InstructionCost>
1564
1565private:
1566 unsigned NumPredStores = 0;
1567
1568 /// \return An upper bound for the vectorization factors for both
1569 /// fixed and scalable vectorization, where the minimum-known number of
1570 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1571 /// disabled or unsupported, then the scalable part will be equal to
1572 /// ElementCount::getScalable(0).
1573 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1574 ElementCount UserVF,
1575 bool FoldTailByMasking);
1576
1577 /// \return the maximized element count based on the target's vector
1578 /// registers and the loop trip-count, but limited to a maximum safe VF.
1579 /// This is a helper function of computeFeasibleMaxVF.
1580 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1581 unsigned SmallestType,
1582 unsigned WidestType,
1583 ElementCount MaxSafeVF,
1584 bool FoldTailByMasking);
1585
1586 /// Checks if scalable vectorization is supported and enabled. Caches the
1587 /// result to avoid repeated debug dumps for repeated queries.
1588 bool isScalableVectorizationAllowed();
1589
1590 /// \return the maximum legal scalable VF, based on the safe max number
1591 /// of elements.
1592 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1593
1594 /// Calculate vectorization cost of memory instruction \p I.
1595 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1596
1597 /// The cost computation for scalarized memory instruction.
1598 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1599
1600 /// The cost computation for interleaving group of memory instructions.
1601 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1602
1603 /// The cost computation for Gather/Scatter instruction.
1604 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1605
1606 /// The cost computation for widening instruction \p I with consecutive
1607 /// memory access.
1608 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1609
1610 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1611 /// Load: scalar load + broadcast.
1612 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1613 /// element)
1614 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1615
1616 /// Estimate the overhead of scalarizing an instruction. This is a
1617 /// convenience wrapper for the type-based getScalarizationOverhead API.
1618 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1620
1621 /// Returns true if an artificially high cost for emulated masked memrefs
1622 /// should be used.
1623 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1624
1625 /// Map of scalar integer values to the smallest bitwidth they can be legally
1626 /// represented as. The vector equivalents of these values should be truncated
1627 /// to this type.
1629
1630 /// A type representing the costs for instructions if they were to be
1631 /// scalarized rather than vectorized. The entries are Instruction-Cost
1632 /// pairs.
1633 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1634
1635 /// A set containing all BasicBlocks that are known to be present after
1636 /// vectorization as a predicated block.
1638 PredicatedBBsAfterVectorization;
1639
1640 /// Records whether it is allowed to have the original scalar loop execute at
1641 /// least once. This may be needed as a fallback loop in case runtime
1642 /// aliasing/dependence checks fail, or to handle the tail/remainder
1643 /// iterations when the trip count is unknown or doesn't divide by the VF,
1644 /// or as a peel-loop to handle gaps in interleave-groups.
1645 /// Under optsize and when the trip count is very small we don't allow any
1646 /// iterations to execute in the scalar loop.
1647 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1648
1649 /// Controls the finally chosen tail folding style. The first element is used
1650 /// if the IV update may overflow; the second is used if it does not.
1651 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1652 ChosenTailFoldingStyle;
1653
1654 /// true if scalable vectorization is supported and enabled.
1655 std::optional<bool> IsScalableVectorizationAllowed;
1656
1657 /// A map holding scalar costs for different vectorization factors. The
1658 /// presence of a cost for an instruction in the mapping indicates that the
1659 /// instruction will be scalarized when vectorizing with the associated
1660 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1662
1663 /// Holds the instructions known to be uniform after vectorization.
1664 /// The data is collected per VF.
1666
1667 /// Holds the instructions known to be scalar after vectorization.
1668 /// The data is collected per VF.
1670
1671 /// Holds the instructions (address computations) that are forced to be
1672 /// scalarized.
1674
1675 /// PHINodes of the reductions that should be expanded in-loop.
1676 SmallPtrSet<PHINode *, 4> InLoopReductions;
1677
1678 /// A Map of inloop reduction operations and their immediate chain operand.
1679 /// FIXME: This can be removed once reductions can be costed correctly in
1680 /// VPlan. This was added to allow quick lookup of the inloop operations.
1681 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1682
1683 /// Returns the expected difference in cost from scalarizing the expression
1684 /// feeding a predicated instruction \p PredInst. The instructions to
1685 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1686 /// non-negative return value implies the expression will be scalarized.
1687 /// Currently, only single-use chains are considered for scalarization.
1688 InstructionCost computePredInstDiscount(Instruction *PredInst,
1689 ScalarCostsTy &ScalarCosts,
1690 ElementCount VF);
1691
1692 /// Collect the instructions that are uniform after vectorization. An
1693 /// instruction is uniform if we represent it with a single scalar value in
1694 /// the vectorized loop corresponding to each vector iteration. Examples of
1695 /// uniform instructions include pointer operands of consecutive or
1696 /// interleaved memory accesses. Note that although uniformity implies an
1697 /// instruction will be scalar, the reverse is not true. In general, a
1698 /// scalarized instruction will be represented by VF scalar values in the
1699 /// vectorized loop, each corresponding to an iteration of the original
1700 /// scalar loop.
1701 void collectLoopUniforms(ElementCount VF);
1702
1703 /// Collect the instructions that are scalar after vectorization. An
1704 /// instruction is scalar if it is known to be uniform or will be scalarized
1705 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1706 /// to the list if they are used by a load/store instruction that is marked as
1707 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1708 /// VF values in the vectorized loop, each corresponding to an iteration of
1709 /// the original scalar loop.
1710 void collectLoopScalars(ElementCount VF);
1711
1712 /// Keeps cost model vectorization decision and cost for instructions.
1713 /// Right now it is used for memory instructions only.
1714 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1715 std::pair<InstWidening, InstructionCost>>;
1716
1717 DecisionList WideningDecisions;
1718
1719 using CallDecisionList =
1720 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1721
1722 CallDecisionList CallWideningDecisions;
1723
1724 /// Returns true if \p V is expected to be vectorized and it needs to be
1725 /// extracted.
1726 bool needsExtract(Value *V, ElementCount VF) const {
1727 Instruction *I = dyn_cast<Instruction>(V);
1728 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1730 return false;
1731
1732 // Assume we can vectorize V (and hence we need extraction) if the
1733 // scalars are not computed yet. This can happen, because it is called
1734 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1735 // the scalars are collected. That should be a safe assumption in most
1736 // cases, because we check if the operands have vectorizable types
1737 // beforehand in LoopVectorizationLegality.
1738 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1739 };
1740
1741 /// Returns a range containing only operands needing to be extracted.
1742 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1743 ElementCount VF) const {
1744 return SmallVector<Value *, 4>(make_filter_range(
1745 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1746 }
1747
1748public:
1749 /// The loop that we evaluate.
1751
1752 /// Predicated scalar evolution analysis.
1754
1755 /// Loop Info analysis.
1757
1758 /// Vectorization legality.
1760
1761 /// Vector target information.
1763
1764 /// Target Library Info.
1766
1767 /// Demanded bits analysis.
1769
1770 /// Assumption cache.
1772
1773 /// Interface to emit optimization remarks.
1775
1777
1778 /// Loop Vectorize Hint.
1780
1781 /// The interleave access information contains groups of interleaved accesses
1782 /// with the same stride and close to each other.
1784
1785 /// Values to ignore in the cost model.
1787
1788 /// Values to ignore in the cost model when VF > 1.
1790
1791 /// All element types found in the loop.
1793};
1794} // end namespace llvm
1795
1796namespace {
1797/// Helper struct to manage generating runtime checks for vectorization.
1798///
1799 /// The runtime checks are created up-front in temporary blocks, un-linked from
1800 /// the existing IR, to allow better estimating their cost. After deciding to
1801/// vectorize, the checks are moved back. If deciding not to vectorize, the
1802/// temporary blocks are completely removed.
1803class GeneratedRTChecks {
1804 /// Basic block which contains the generated SCEV checks, if any.
1805 BasicBlock *SCEVCheckBlock = nullptr;
1806
1807 /// The value representing the result of the generated SCEV checks. If it is
1808 /// nullptr, either no SCEV checks have been generated or they have been used.
1809 Value *SCEVCheckCond = nullptr;
1810
1811 /// Basic block which contains the generated memory runtime checks, if any.
1812 BasicBlock *MemCheckBlock = nullptr;
1813
1814 /// The value representing the result of the generated memory runtime checks.
1815 /// If it is nullptr, either no memory runtime checks have been generated or
1816 /// they have been used.
1817 Value *MemRuntimeCheckCond = nullptr;
1818
1819 DominatorTree *DT;
1820 LoopInfo *LI;
1822
1823 SCEVExpander SCEVExp;
1824 SCEVExpander MemCheckExp;
1825
1826 bool CostTooHigh = false;
1827 const bool AddBranchWeights;
1828
1829 Loop *OuterLoop = nullptr;
1830
1831public:
1832 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1834 bool AddBranchWeights)
1835 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1836 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1837
1838 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1839 /// accurately estimate the cost of the runtime checks. The blocks are
1840 /// un-linked from the IR and are added back during vector code generation. If
1841 /// there is no vector code generation, the check blocks are removed
1842 /// completely.
1843 void Create(Loop *L, const LoopAccessInfo &LAI,
1844 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1845
1846 // Hard cutoff to limit compile-time increase in case a very large number of
1847 // runtime checks needs to be generated.
1848 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1849 // profile info.
1850 CostTooHigh =
1852 if (CostTooHigh)
1853 return;
1854
1855 BasicBlock *LoopHeader = L->getHeader();
1856 BasicBlock *Preheader = L->getLoopPreheader();
1857
1858 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1859 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1860 // may be used by SCEVExpander. The blocks will be un-linked from their
1861 // predecessors and removed from LI & DT at the end of the function.
1862 if (!UnionPred.isAlwaysTrue()) {
1863 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1864 nullptr, "vector.scevcheck");
1865
1866 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1867 &UnionPred, SCEVCheckBlock->getTerminator());
1868 }
1869
1870 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1871 if (RtPtrChecking.Need) {
1872 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1873 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1874 "vector.memcheck");
1875
1876 auto DiffChecks = RtPtrChecking.getDiffChecks();
1877 if (DiffChecks) {
1878 Value *RuntimeVF = nullptr;
1879 MemRuntimeCheckCond = addDiffRuntimeChecks(
1880 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1881 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1882 if (!RuntimeVF)
1883 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1884 return RuntimeVF;
1885 },
1886 IC);
1887 } else {
1888 MemRuntimeCheckCond = addRuntimeChecks(
1889 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1891 }
1892 assert(MemRuntimeCheckCond &&
1893 "no RT checks generated although RtPtrChecking "
1894 "claimed checks are required");
1895 }
1896
1897 if (!MemCheckBlock && !SCEVCheckBlock)
1898 return;
1899
1900 // Unhook the temporary block with the checks, update various places
1901 // accordingly.
1902 if (SCEVCheckBlock)
1903 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1904 if (MemCheckBlock)
1905 MemCheckBlock->replaceAllUsesWith(Preheader);
1906
1907 if (SCEVCheckBlock) {
1908 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1909 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1910 Preheader->getTerminator()->eraseFromParent();
1911 }
1912 if (MemCheckBlock) {
1913 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1914 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1915 Preheader->getTerminator()->eraseFromParent();
1916 }
1917
1918 DT->changeImmediateDominator(LoopHeader, Preheader);
1919 if (MemCheckBlock) {
1920 DT->eraseNode(MemCheckBlock);
1921 LI->removeBlock(MemCheckBlock);
1922 }
1923 if (SCEVCheckBlock) {
1924 DT->eraseNode(SCEVCheckBlock);
1925 LI->removeBlock(SCEVCheckBlock);
1926 }
1927
1928 // Outer loop is used as part of the later cost calculations.
1929 OuterLoop = L->getParentLoop();
1930 }
1931
1932 InstructionCost getCost() {
1933 if (SCEVCheckBlock || MemCheckBlock)
1934 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1935
1936 if (CostTooHigh) {
1937 InstructionCost Cost;
1938 Cost.setInvalid();
1939 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1940 return Cost;
1941 }
1942
1943 InstructionCost RTCheckCost = 0;
1944 if (SCEVCheckBlock)
1945 for (Instruction &I : *SCEVCheckBlock) {
1946 if (SCEVCheckBlock->getTerminator() == &I)
1947 continue;
1950 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1951 RTCheckCost += C;
1952 }
1953 if (MemCheckBlock) {
1954 InstructionCost MemCheckCost = 0;
1955 for (Instruction &I : *MemCheckBlock) {
1956 if (MemCheckBlock->getTerminator() == &I)
1957 continue;
1960 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1961 MemCheckCost += C;
1962 }
1963
1964 // If the runtime memory checks are being created inside an outer loop
1965 // we should find out if these checks are outer loop invariant. If so,
1966 // the checks will likely be hoisted out and so the effective cost will
1967 // reduce according to the outer loop trip count.
1968 if (OuterLoop) {
1969 ScalarEvolution *SE = MemCheckExp.getSE();
1970 // TODO: If profitable, we could refine this further by analysing every
1971 // individual memory check, since there could be a mixture of loop
1972 // variant and invariant checks that mean the final condition is
1973 // variant.
1974 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1975 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1976 // It seems reasonable to assume that we can reduce the effective
1977 // cost of the checks even when we know nothing about the trip
1978 // count. Assume that the outer loop executes at least twice.
1979 unsigned BestTripCount = 2;
1980
1981 // If exact trip count is known use that.
1982 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
1983 BestTripCount = SmallTC;
1985 // Else use profile data if available.
1986 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
1987 BestTripCount = *EstimatedTC;
1988 }
1989
1990 BestTripCount = std::max(BestTripCount, 1U);
1991 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1992
1993 // Let's ensure the cost is always at least 1.
1994 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
1995 (InstructionCost::CostType)1);
1996
1997 if (BestTripCount > 1)
1999 << "We expect runtime memory checks to be hoisted "
2000 << "out of the outer loop. Cost reduced from "
2001 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2002
2003 MemCheckCost = NewMemCheckCost;
2004 }
2005 }
2006
2007 RTCheckCost += MemCheckCost;
2008 }
2009
2010 if (SCEVCheckBlock || MemCheckBlock)
2011 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2012 << "\n");
2013
2014 return RTCheckCost;
2015 }
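// Worked example of the adjustment above (illustrative, not part of the
// original source): if the memory checks cost 12 and the invariant check
// condition sits inside an outer loop with an estimated trip count of 4, the
// effective cost becomes 12 / 4 = 3 (clamped to a minimum of 1). With no
// trip-count information the code conservatively assumes the outer loop runs
// at least twice, halving the cost.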
2016
2017 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2018 /// unused.
2019 ~GeneratedRTChecks() {
2020 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2021 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2022 if (!SCEVCheckCond)
2023 SCEVCleaner.markResultUsed();
2024
2025 if (!MemRuntimeCheckCond)
2026 MemCheckCleaner.markResultUsed();
2027
2028 if (MemRuntimeCheckCond) {
2029 auto &SE = *MemCheckExp.getSE();
2030 // Memory runtime check generation creates compares that use expanded
2031 // values. Remove them before running the SCEVExpanderCleaners.
2032 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2033 if (MemCheckExp.isInsertedInstruction(&I))
2034 continue;
2035 SE.forgetValue(&I);
2036 I.eraseFromParent();
2037 }
2038 }
2039 MemCheckCleaner.cleanup();
2040 SCEVCleaner.cleanup();
2041
2042 if (SCEVCheckCond)
2043 SCEVCheckBlock->eraseFromParent();
2044 if (MemRuntimeCheckCond)
2045 MemCheckBlock->eraseFromParent();
2046 }
2047
2048 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2049 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2050 /// depending on the generated condition.
2051 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2052 BasicBlock *LoopVectorPreHeader,
2053 BasicBlock *LoopExitBlock) {
2054 if (!SCEVCheckCond)
2055 return nullptr;
2056
2057 Value *Cond = SCEVCheckCond;
2058 // Mark the check as used, to prevent it from being removed during cleanup.
2059 SCEVCheckCond = nullptr;
2060 if (auto *C = dyn_cast<ConstantInt>(Cond))
2061 if (C->isZero())
2062 return nullptr;
2063
2064 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2065
2066 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2067 // Create new preheader for vector loop.
2068 if (OuterLoop)
2069 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2070
2071 SCEVCheckBlock->getTerminator()->eraseFromParent();
2072 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2073 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2074 SCEVCheckBlock);
2075
2076 DT->addNewBlock(SCEVCheckBlock, Pred);
2077 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2078
2079 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2080 if (AddBranchWeights)
2081 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2082 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2083 return SCEVCheckBlock;
2084 }
2085
2086 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2087 /// the branches to branch to the vector preheader or \p Bypass, depending on
2088 /// the generated condition.
2089 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2090 BasicBlock *LoopVectorPreHeader) {
2091 // Check if we generated code that checks at runtime whether arrays overlap.
2092 if (!MemRuntimeCheckCond)
2093 return nullptr;
2094
2095 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2096 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2097 MemCheckBlock);
2098
2099 DT->addNewBlock(MemCheckBlock, Pred);
2100 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2101 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2102
2103 if (OuterLoop)
2104 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2105
2106 BranchInst &BI =
2107 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2108 if (AddBranchWeights) {
2109 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2110 }
2111 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2112 MemCheckBlock->getTerminator()->setDebugLoc(
2113 Pred->getTerminator()->getDebugLoc());
2114
2115 // Mark the check as used, to prevent it from being removed during cleanup.
2116 MemRuntimeCheckCond = nullptr;
2117 return MemCheckBlock;
2118 }
2119};
2120} // namespace
2121
2122 static bool useActiveLaneMask(TailFoldingStyle Style) {
2123 return Style == TailFoldingStyle::Data ||
2124 Style == TailFoldingStyle::DataAndControlFlow ||
2125 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2126}
2127
2128 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2129 return Style == TailFoldingStyle::DataAndControlFlow ||
2130 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2131}
2132
2133// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2134// vectorization. The loop needs to be annotated with #pragma omp simd
2135// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2136// vector length information is not provided, vectorization is not considered
2137// explicit. Interleave hints are not allowed either. These limitations will be
2138// relaxed in the future.
2139 // Please note that we are currently forced to abuse the pragma 'clang
2140// vectorize' semantics. This pragma provides *auto-vectorization hints*
2141// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2142// provides *explicit vectorization hints* (LV can bypass legal checks and
2143// assume that vectorization is legal). However, both hints are implemented
2144// using the same metadata (llvm.loop.vectorize, processed by
2145// LoopVectorizeHints). This will be fixed in the future when the native IR
2146// representation for pragma 'omp simd' is introduced.
2147static bool isExplicitVecOuterLoop(Loop *OuterLp,
2148 OptimizationRemarkEmitter *ORE) {
2149 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2150 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2151
2152 // Only outer loops with an explicit vectorization hint are supported.
2153 // Unannotated outer loops are ignored.
2155 return false;
2156
2157 Function *Fn = OuterLp->getHeader()->getParent();
2158 if (!Hints.allowVectorization(Fn, OuterLp,
2159 true /*VectorizeOnlyWhenForced*/)) {
2160 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2161 return false;
2162 }
2163
2164 if (Hints.getInterleave() > 1) {
2165 // TODO: Interleave support is future work.
2166 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2167 "outer loops.\n");
2168 Hints.emitRemarkWithHints();
2169 return false;
2170 }
2171
2172 return true;
2173}
2174
2178 // Collect inner loops and outer loops without irreducible control flow. For
2179 // now, only collect outer loops that have explicit vectorization hints. If we
2180 // are stress testing the VPlan H-CFG construction, we collect the outermost
2181 // loop of every loop nest.
2182 if (L.isInnermost() || VPlanBuildStressTest ||
2183 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2184 LoopBlocksRPO RPOT(&L);
2185 RPOT.perform(LI);
2186 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2187 V.push_back(&L);
2188 // TODO: Collect inner loops inside marked outer loops in case
2189 // vectorization fails for the outer loop. Do not invoke
2190 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2191 // already known to be reducible. We can use an inherited attribute for
2192 // that.
2193 return;
2194 }
2195 }
2196 for (Loop *InnerL : L)
2197 collectSupportedLoops(*InnerL, LI, ORE, V);
2198}
2199
2200//===----------------------------------------------------------------------===//
2201// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2202// LoopVectorizationCostModel and LoopVectorizationPlanner.
2203//===----------------------------------------------------------------------===//
2204
2205/// Compute the transformed value of Index at offset StartValue using step
2206/// StepValue.
2207/// For integer induction, returns StartValue + Index * StepValue.
2208/// For pointer induction, returns StartValue[Index * StepValue].
2209/// FIXME: The newly created binary instructions should contain nsw/nuw
2210/// flags, which can be found from the original scalar operations.
2211 static Value *
2212 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2213 Value *Step,
2214 InductionDescriptor::InductionKind InductionKind,
2215 const BinaryOperator *InductionBinOp) {
2216 Type *StepTy = Step->getType();
2217 Value *CastedIndex = StepTy->isIntegerTy()
2218 ? B.CreateSExtOrTrunc(Index, StepTy)
2219 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2220 if (CastedIndex != Index) {
2221 CastedIndex->setName(CastedIndex->getName() + ".cast");
2222 Index = CastedIndex;
2223 }
2224
2225 // Note: the IR at this point is broken. We cannot use SE to create any new
2226 // SCEV and then expand it, hoping that SCEV's simplification will give us
2227 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2228 // lead to various SCEV crashes. So all we can do is to use builder and rely
2229 // on InstCombine for future simplifications. Here we handle some trivial
2230 // cases only.
2231 auto CreateAdd = [&B](Value *X, Value *Y) {
2232 assert(X->getType() == Y->getType() && "Types don't match!");
2233 if (auto *CX = dyn_cast<ConstantInt>(X))
2234 if (CX->isZero())
2235 return Y;
2236 if (auto *CY = dyn_cast<ConstantInt>(Y))
2237 if (CY->isZero())
2238 return X;
2239 return B.CreateAdd(X, Y);
2240 };
2241
2242 // We allow X to be a vector type, in which case Y will potentially be
2243 // splatted into a vector with the same element count.
2244 auto CreateMul = [&B](Value *X, Value *Y) {
2245 assert(X->getType()->getScalarType() == Y->getType() &&
2246 "Types don't match!");
2247 if (auto *CX = dyn_cast<ConstantInt>(X))
2248 if (CX->isOne())
2249 return Y;
2250 if (auto *CY = dyn_cast<ConstantInt>(Y))
2251 if (CY->isOne())
2252 return X;
2253 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2254 if (XVTy && !isa<VectorType>(Y->getType()))
2255 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2256 return B.CreateMul(X, Y);
2257 };
2258
2259 switch (InductionKind) {
2260 case InductionDescriptor::IK_IntInduction: {
2261 assert(!isa<VectorType>(Index->getType()) &&
2262 "Vector indices not supported for integer inductions yet");
2263 assert(Index->getType() == StartValue->getType() &&
2264 "Index type does not match StartValue type");
2265 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2266 return B.CreateSub(StartValue, Index);
2267 auto *Offset = CreateMul(Index, Step);
2268 return CreateAdd(StartValue, Offset);
2269 }
2270 case InductionDescriptor::IK_PtrInduction:
2271 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2272 case InductionDescriptor::IK_FpInduction: {
2273 assert(!isa<VectorType>(Index->getType()) &&
2274 "Vector indices not supported for FP inductions yet");
2275 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2276 assert(InductionBinOp &&
2277 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2278 InductionBinOp->getOpcode() == Instruction::FSub) &&
2279 "Original bin op should be defined for FP induction");
2280
2281 Value *MulExp = B.CreateFMul(Step, Index);
2282 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2283 "induction");
2284 }
2285 case InductionDescriptor::IK_NoInduction:
2286 return nullptr;
2287 }
2288 llvm_unreachable("invalid enum");
2289}
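// Illustrative example (not part of the original source): for an integer
// induction with StartValue = 5, Step = 3 and Index = 8, the code above emits
// 5 + 8 * 3 = 29; for a pointer induction it emits a ptradd of StartValue by
// Index * Step; for an FP induction it emits StartValue fadd/fsub (Step * Index)
// using the fast-math flags of the original induction operation.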
2290
2291std::optional<unsigned> getMaxVScale(const Function &F,
2292 const TargetTransformInfo &TTI) {
2293 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2294 return MaxVScale;
2295
2296 if (F.hasFnAttribute(Attribute::VScaleRange))
2297 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2298
2299 return std::nullopt;
2300}
2301
2302/// For the given VF and UF and maximum trip count computed for the loop, return
2303/// whether the induction variable might overflow in the vectorized loop. If not,
2304/// then we know a runtime overflow check always evaluates to false and can be
2305/// removed.
2308 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2309 // Always be conservative if we don't know the exact unroll factor.
2310 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2311
2312 Type *IdxTy = Cost->Legal->getWidestInductionType();
2313 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2314
2315 // The runtime overflow check is known false iff the (max) trip-count
2316 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2317 // the vector loop induction variable.
2318 if (unsigned TC =
2319 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2320 uint64_t MaxVF = VF.getKnownMinValue();
2321 if (VF.isScalable()) {
2322 std::optional<unsigned> MaxVScale =
2323 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2324 if (!MaxVScale)
2325 return false;
2326 MaxVF *= *MaxVScale;
2327 }
2328
2329 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2330 }
2331
2332 return false;
2333}
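// Worked example (illustrative, not part of the original source): for an i8
// induction type the mask is 255. With a known maximum trip count of 200,
// VF = 4 and UF = 2, the test is 255 - 200 = 55 > 4 * 2 = 8, so the induction
// cannot wrap and the overflow check is known false. With a maximum trip count
// of 250 the test is 5 > 8, which fails, and the check must be kept.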
2334
2335// Return whether we allow using masked interleave-groups (for dealing with
2336// strided loads/stores that reside in predicated blocks, or for dealing
2337// with gaps).
2339 // If an override option has been passed in for interleaved accesses, use it.
2340 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2342
2344}
2345
2347 VPReplicateRecipe *RepRecipe,
2348 const VPIteration &Instance,
2349 VPTransformState &State) {
2350 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2351
2352 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2353 // the first lane and part.
2354 if (isa<NoAliasScopeDeclInst>(Instr))
2355 if (!Instance.isFirstIteration())
2356 return;
2357
2358 // Does this instruction return a value?
2359 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2360
2361 Instruction *Cloned = Instr->clone();
2362 if (!IsVoidRetTy) {
2363 Cloned->setName(Instr->getName() + ".cloned");
2364#if !defined(NDEBUG)
2365 // Verify that VPlan type inference results agree with the type of the
2366 // generated values.
2367 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2368 "inferred type and type from generated instructions do not match");
2369#endif
2370 }
2371
2372 RepRecipe->setFlags(Cloned);
2373
2374 if (auto DL = Instr->getDebugLoc())
2375 State.setDebugLocFrom(DL);
2376
2377 // Replace the operands of the cloned instructions with their scalar
2378 // equivalents in the new loop.
2379 for (const auto &I : enumerate(RepRecipe->operands())) {
2380 auto InputInstance = Instance;
2381 VPValue *Operand = I.value();
2383 InputInstance.Lane = VPLane::getFirstLane();
2384 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2385 }
2386 State.addNewMetadata(Cloned, Instr);
2387
2388 // Place the cloned scalar in the new loop.
2389 State.Builder.Insert(Cloned);
2390
2391 State.set(RepRecipe, Cloned, Instance);
2392
2393 // If we just cloned a new assumption, add it to the assumption cache.
2394 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2396
2397 // End if-block.
2398 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2399 if (IfPredicateInstr)
2400 PredicatedInstructions.push_back(Cloned);
2401}
2402
2403Value *
2405 if (VectorTripCount)
2406 return VectorTripCount;
2407
2408 Value *TC = getTripCount();
2409 IRBuilder<> Builder(InsertBlock->getTerminator());
2410
2411 Type *Ty = TC->getType();
2412 // This is where we can make the step a runtime constant.
2413 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2414
2415 // If the tail is to be folded by masking, round the number of iterations N
2416 // up to a multiple of Step instead of rounding down. This is done by first
2417 // adding Step-1 and then rounding down. Note that it's ok if this addition
2418 // overflows: the vector induction variable will eventually wrap to zero given
2419 // that it starts at zero and its Step is a power of two; the loop will then
2420 // exit, with the last early-exit vector comparison also producing all-true.
2421 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2422 // is accounted for in emitIterationCountCheck that adds an overflow check.
2423 if (Cost->foldTailByMasking()) {
2425 "VF*UF must be a power of 2 when folding tail by masking");
2426 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2427 "n.rnd.up");
2428 }
2429
2430 // Now we need to generate the expression for the part of the loop that the
2431 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2432 // iterations are not required for correctness, or N - Step, otherwise. Step
2433 // is equal to the vectorization factor (number of SIMD elements) times the
2434 // unroll factor (number of SIMD instructions).
2435 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2436
2437 // There are cases where we *must* run at least one iteration in the remainder
2438 // loop. See the cost model for when this can happen. If the step evenly
2439 // divides the trip count, we set the remainder to be equal to the step. If
2440 // the step does not evenly divide the trip count, no adjustment is necessary
2441 // since there will already be scalar iterations. Note that the minimum
2442 // iterations check ensures that N >= Step.
2443 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2444 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2445 R = Builder.CreateSelect(IsZero, Step, R);
2446 }
2447
2448 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2449
2450 return VectorTripCount;
2451}
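// Worked example (illustrative, not part of the original source): for a trip
// count of 10 with VF = 4 and UF = 1, Step = 4, n.mod.vf = 10 % 4 = 2 and
// n.vec = 10 - 2 = 8, leaving 2 iterations for the scalar remainder. When
// folding the tail by masking, the count is first rounded: 10 + 3 = 13,
// 13 % 4 = 1, n.vec = 13 - 1 = 12, so the masked vector loop covers all 10
// iterations with no scalar remainder.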
2452
2454 Value *Count = getTripCount();
2455 // Reuse existing vector loop preheader for TC checks.
2456 // Note that new preheader block is generated for vector loop.
2457 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2458 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2459
2460 // Generate code to check if the loop's trip count is less than VF * UF, or
2461 // equal to it in case a scalar epilogue is required; this implies that the
2462 // vector trip count is zero. This check also covers the case where adding one
2463 // to the backedge-taken count overflowed leading to an incorrect trip count
2464 // of zero. In this case we will also jump to the scalar loop.
2465 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2466 : ICmpInst::ICMP_ULT;
2467
2468 // If tail is to be folded, vector loop takes care of all iterations.
2469 Type *CountTy = Count->getType();
2470 Value *CheckMinIters = Builder.getFalse();
2471 auto CreateStep = [&]() -> Value * {
2472 // Create step with max(minimum profitable trip count, UF * VF).
2474 return createStepForVF(Builder, CountTy, VF, UF);
2475
2476 Value *MinProfTC =
2478 if (!VF.isScalable())
2479 return MinProfTC;
2481 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2482 };
2483
2484 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2485 if (Style == TailFoldingStyle::None)
2486 CheckMinIters =
2487 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2488 else if (VF.isScalable() &&
2491 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2492 // an overflow to zero when updating induction variables and so an
2493 // additional overflow check is required before entering the vector loop.
2494
2495 // Get the maximum unsigned value for the type.
2496 Value *MaxUIntTripCount =
2497 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2498 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2499
2500 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2501 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2502 }
2503
2504 // Create new preheader for vector loop.
2506 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2507 "vector.ph");
2508
2509 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2510 DT->getNode(Bypass)->getIDom()) &&
2511 "TC check is expected to dominate Bypass");
2512
2513 // Update dominator for Bypass & LoopExit (if needed).
2514 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2515 BranchInst &BI =
2516 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2518 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2519 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2520 LoopBypassBlocks.push_back(TCCheckBlock);
2521}
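// Illustrative sketch (not part of the original source) of the two guard shapes
// emitted above, assuming VF = 4 and UF = 2:
//   no tail folding:    branch to the scalar loop if Count < 8 (or <= 8 when a
//                       scalar epilogue is required);
//   scalable + folding: branch to the scalar loop if (UMax - Count) < VF * UF,
//                       guarding against the induction update wrapping past
//                       UMax, since vscale need not be a power of two.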
2522
2524 BasicBlock *const SCEVCheckBlock =
2525 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2526 if (!SCEVCheckBlock)
2527 return nullptr;
2528
2529 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2531 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2532 "Cannot SCEV check stride or overflow when optimizing for size");
2533
2534
2535 // Update dominator only if this is first RT check.
2536 if (LoopBypassBlocks.empty()) {
2537 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2538 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2539 // If there is an epilogue which must run, there's no edge from the
2540 // middle block to exit blocks and thus no need to update the immediate
2541 // dominator of the exit blocks.
2542 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2543 }
2544
2545 LoopBypassBlocks.push_back(SCEVCheckBlock);
2546 AddedSafetyChecks = true;
2547 return SCEVCheckBlock;
2548}
2549
2551 // VPlan-native path does not do any analysis for runtime checks currently.
2553 return nullptr;
2554
2555 BasicBlock *const MemCheckBlock =
2556 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2557
2558 // Check if we generated code that checks at runtime whether arrays overlap.
2559 // We put the checks into a separate block to make the more common case of few
2560 // elements faster.
2561 if (!MemCheckBlock)
2562 return nullptr;
2563
2564 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2565 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2566 "Cannot emit memory checks when optimizing for size, unless forced "
2567 "to vectorize.");
2568 ORE->emit([&]() {
2569 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2572 << "Code-size may be reduced by not forcing "
2573 "vectorization, or by source-code modifications "
2574 "eliminating the need for runtime checks "
2575 "(e.g., adding 'restrict').";
2576 });
2577 }
2578
2579 LoopBypassBlocks.push_back(MemCheckBlock);
2580
2581 AddedSafetyChecks = true;
2582
2583 return MemCheckBlock;
2584}
2585
2589 assert(LoopVectorPreHeader && "Invalid loop structure");
2590 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2591 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2592 "multiple exit loop without required epilogue?");
2593
2596 LI, nullptr, Twine(Prefix) + "middle.block");
2599 nullptr, Twine(Prefix) + "scalar.ph");
2600}
2601
2603 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2604 ArrayRef<BasicBlock *> BypassBlocks,
2605 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2607 assert(VectorTripCount && "Expected valid arguments");
2608
2609 Instruction *OldInduction = Legal->getPrimaryInduction();
2610 Value *&EndValue = IVEndValues[OrigPhi];
2611 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2612 if (OrigPhi == OldInduction) {
2613 // We know what the end value is.
2614 EndValue = VectorTripCount;
2615 } else {
2617
2618 // Fast-math-flags propagate from the original induction instruction.
2619 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2620 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2621
2622 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2623 Step, II.getKind(), II.getInductionBinOp());
2624 EndValue->setName("ind.end");
2625
2626 // Compute the end value for the additional bypass (if applicable).
2627 if (AdditionalBypass.first) {
2628 B.SetInsertPoint(AdditionalBypass.first,
2629 AdditionalBypass.first->getFirstInsertionPt());
2630 EndValueFromAdditionalBypass =
2631 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
2632 Step, II.getKind(), II.getInductionBinOp());
2633 EndValueFromAdditionalBypass->setName("ind.end");
2634 }
2635 }
2636
2637 // Create phi nodes to merge from the backedge-taken check block.
2638 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
2640 // Copy original phi DL over to the new one.
2641 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2642
2643 // The new PHI merges the original incoming value, in case of a bypass,
2644 // or the value at the end of the vectorized loop.
2645 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
2646
2647 // Fix the scalar body counter (PHI node).
2648 // The old induction's phi node in the scalar body needs the truncated
2649 // value.
2650 for (BasicBlock *BB : BypassBlocks)
2651 BCResumeVal->addIncoming(II.getStartValue(), BB);
2652
2653 if (AdditionalBypass.first)
2654 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
2655 EndValueFromAdditionalBypass);
2656 return BCResumeVal;
2657}
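// Illustrative example (not part of the original source): for a primary
// induction starting at 0 with a vector trip count of 96, the bc.resume.val phi
// created above receives 96 from the middle block and 0 (the original start
// value) from each bypass block, so the scalar remainder loop resumes exactly
// where the vector loop stopped.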
2658
2659/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2660/// expansion results.
2662 const SCEV2ValueTy &ExpandedSCEVs) {
2663 const SCEV *Step = ID.getStep();
2664 if (auto *C = dyn_cast<SCEVConstant>(Step))
2665 return C->getValue();
2666 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2667 return U->getValue();
2668 auto I = ExpandedSCEVs.find(Step);
2669 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2670 return I->second;
2671}
2672
2674 const SCEV2ValueTy &ExpandedSCEVs,
2675 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2676 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
2677 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
2678 "Inconsistent information about additional bypass.");
2679 // We are going to resume the execution of the scalar loop.
2680 // Go over all of the induction variables that we found and fix the
2681 // PHIs that are left in the scalar version of the loop.
2682 // The starting values of PHI nodes depend on the counter of the last
2683 // iteration in the vectorized loop.
2684 // If we come from a bypass edge then we need to start from the original
2685 // start value.
2686 for (const auto &InductionEntry : Legal->getInductionVars()) {
2687 PHINode *OrigPhi = InductionEntry.first;
2688 const InductionDescriptor &II = InductionEntry.second;
2689 PHINode *BCResumeVal = createInductionResumeValue(
2690 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
2691 AdditionalBypass);
2692 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
2693 }
2694}
2695
2696std::pair<BasicBlock *, Value *>
2698 const SCEV2ValueTy &ExpandedSCEVs) {
2699 /*
2700 In this function we generate a new loop. The new loop will contain
2701 the vectorized instructions while the old loop will continue to run the
2702 scalar remainder.
2703
2704 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2705 / | preheader are expanded here. Eventually all required SCEV
2706 / | expansion should happen here.
2707 / v
2708 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2709 | / |
2710 | / v
2711 || [ ] <-- vector pre header.
2712 |/ |
2713 | v
2714 | [ ] \
2715 | [ ]_| <-- vector loop (created during VPlan execution).
2716 | |
2717 | v
2718 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2719 | | successors created during VPlan execution)
2720 \/ |
2721 /\ v
2722 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2723 | |
2724 (opt) v <-- edge from middle to exit iff epilogue is not required.
2725 | [ ] \
2726 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
2727 \ |
2728 \ v
2729 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2730 ...
2731 */
2732
2733 // Create an empty vector loop, and prepare basic blocks for the runtime
2734 // checks.
2736
2737 // Now, compare the new count to zero. If it is zero skip the vector loop and
2738 // jump to the scalar loop. This check also covers the case where the
2739 // backedge-taken count is uint##_max: adding one to it will overflow leading
2740 // to an incorrect trip count of zero. In this (rare) case we will also jump
2741 // to the scalar loop.
2743
2744 // Generate the code to check any assumptions that we've made for SCEV
2745 // expressions.
2747
2748 // Generate the code that checks at runtime whether arrays overlap. We put the
2749 // checks into a separate block to make the more common case of few elements
2750 // faster.
2752
2753 // Emit phis for the new starting index of the scalar loop.
2754 createInductionResumeValues(ExpandedSCEVs);
2755
2756 return {LoopVectorPreHeader, nullptr};
2757}
2758
2759// Fix up external users of the induction variable. At this point, we are
2760// in LCSSA form, with all external PHIs that use the IV having one input value,
2761// coming from the remainder loop. We need those PHIs to also have a correct
2762// value for the IV when arriving directly from the middle block.
2764 const InductionDescriptor &II,
2765 Value *VectorTripCount, Value *EndValue,
2766 BasicBlock *MiddleBlock,
2767 BasicBlock *VectorHeader, VPlan &Plan,
2768 VPTransformState &State) {
2769 // There are two kinds of external IV usages - those that use the value
2770 // computed in the last iteration (the PHI) and those that use the penultimate
2771 // value (the value that feeds into the phi from the loop latch).
2772 // We allow both, but they, obviously, have different values.
2773
2774 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
2775
2776 DenseMap<Value *, Value *> MissingVals;
2777
2778 // An external user of the last iteration's value should see the value that
2779 // the remainder loop uses to initialize its own IV.
2781 for (User *U : PostInc->users()) {
2782 Instruction *UI = cast<Instruction>(U);
2783 if (!OrigLoop->contains(UI)) {
2784 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2785 MissingVals[UI] = EndValue;
2786 }
2787 }
2788
2789 // An external user of the penultimate value needs to see EndValue - Step.
2790 // The simplest way to get this is to recompute it from the constituent SCEVs,
2791 // that is Start + (Step * (CRD - 1)).
2792 for (User *U : OrigPhi->users()) {
2793 auto *UI = cast<Instruction>(U);
2794 if (!OrigLoop->contains(UI)) {
2795 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2796 IRBuilder<> B(MiddleBlock->getTerminator());
2797
2798 // Fast-math-flags propagate from the original induction instruction.
2799 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2800 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2801
2802 Value *CountMinusOne = B.CreateSub(
2803 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
2804 CountMinusOne->setName("cmo");
2805
2806 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2807 assert(StepVPV && "step must have been expanded during VPlan execution");
2808 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2809 : State.get(StepVPV, {0, 0});
2810 Value *Escape =
2811 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
2812 II.getKind(), II.getInductionBinOp());
2813 Escape->setName("ind.escape");
2814 MissingVals[UI] = Escape;
2815 }
2816 }
2817
2818 for (auto &I : MissingVals) {
2819 PHINode *PHI = cast<PHINode>(I.first);
2820 // One corner case we have to handle is two IVs "chasing" each other,
2821 // that is %IV2 = phi [...], [ %IV1, %latch ]
2822 // In this case, if IV1 has an external use, we need to avoid adding both
2823 // "last value of IV1" and "penultimate value of IV2". So, verify that we
2824 // don't already have an incoming value for the middle block.
2825 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
2826 PHI->addIncoming(I.second, MiddleBlock);
2827 Plan.removeLiveOut(PHI);
2828 }
2829 }
2830}
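// Illustrative example (not part of the original source): for
//   for (i = 0; i < n; ++i) { ... }   ... use of i after the loop ...
// an LCSSA phi in the exit block that uses the post-increment value receives
// EndValue from the middle block, while a user of i itself (the penultimate
// value) receives the value recomputed above from VectorTripCount - 1, e.g.
// Start + Step * (n.vec - 1).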
2831
2832namespace {
2833
2834struct CSEDenseMapInfo {
2835 static bool canHandle(const Instruction *I) {
2836 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2837 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2838 }
2839
2840 static inline Instruction *getEmptyKey() {
2841 return DenseMapInfo<Instruction *>::getEmptyKey();
2842 }
2843
2844 static inline Instruction *getTombstoneKey() {
2845 return DenseMapInfo<Instruction *>::getTombstoneKey();
2846 }
2847
2848 static unsigned getHashValue(const Instruction *I) {
2849 assert(canHandle(I) && "Unknown instruction!");
2850 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2851 I->value_op_end()));
2852 }
2853
2854 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2855 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2856 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2857 return LHS == RHS;
2858 return LHS->isIdenticalTo(RHS);
2859 }
2860};
2861
2862} // end anonymous namespace
2863
2864 /// Perform CSE of induction variable instructions.
2865static void cse(BasicBlock *BB) {
2866 // Perform simple CSE.
2867 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2868 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2869 if (!CSEDenseMapInfo::canHandle(&In))
2870 continue;
2871
2872 // Check if we can replace this instruction with any of the
2873 // visited instructions.
2874 if (Instruction *V = CSEMap.lookup(&In)) {
2875 In.replaceAllUsesWith(V);
2876 In.eraseFromParent();
2877 continue;
2878 }
2879
2880 CSEMap[&In] = &In;
2881 }
2882}
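// Illustrative example (not part of the original source): if unrolling left two
// identical `extractelement <4 x i32> %v, i64 0` instructions in the vector
// loop header, the second is replaced by the first and erased, because
// CSEDenseMapInfo hashes on the opcode and operand values and compares entries
// with isIdenticalTo().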
2883
2886 ElementCount VF) const {
2887 // We only need to calculate a cost if the VF is scalar; for actual vectors
2888 // we should already have a pre-calculated cost at each VF.
2889 if (!VF.isScalar())
2890 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2891
2893 Type *RetTy = CI->getType();
2895 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2896 return *RedCost;
2897
2899 for (auto &ArgOp : CI->args())
2900 Tys.push_back(ArgOp->getType());
2901
2902 InstructionCost ScalarCallCost =
2904
2905 // If this is an intrinsic we may have a lower cost for it.
2907 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2908 return std::min(ScalarCallCost, IntrinsicCost);
2909 }
2910 return ScalarCallCost;
2911}
2912
2914 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2915 return Elt;
2916 return VectorType::get(Elt, VF);
2917}
2918
2921 ElementCount VF) const {
2923 assert(ID && "Expected intrinsic call!");
2924 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
2925 FastMathFlags FMF;
2926 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2927 FMF = FPMO->getFastMathFlags();
2928
2931 SmallVector<Type *> ParamTys;
2932 std::transform(FTy->param_begin(), FTy->param_end(),
2933 std::back_inserter(ParamTys),
2934 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
2935
2936 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2937 dyn_cast<IntrinsicInst>(CI));
2938 return TTI.getIntrinsicInstrCost(CostAttrs,
2940}
2941
2943 VPlan &Plan) {
2944 // Fix widened non-induction PHIs by setting up the PHI operands.
2946 fixNonInductionPHIs(Plan, State);
2947
2948 // Forget the original basic block.
2951
2952 // After vectorization, the exit blocks of the original loop will have
2953 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2954 // looked through single-entry phis.
2955 SmallVector<BasicBlock *> ExitBlocks;
2956 OrigLoop->getExitBlocks(ExitBlocks);
2957 for (BasicBlock *Exit : ExitBlocks)
2958 for (PHINode &PN : Exit->phis())
2960
2961 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2962 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
2963 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
2964 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2965 // No edge from the middle block to the unique exit block has been inserted
2966 // and there is nothing to fix from vector loop; phis should have incoming
2967 // from scalar loop only.
2968 } else {
2969 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2970 // the cost model.
2971
2972 // If we inserted an edge from the middle block to the unique exit block,
2973 // update uses outside the loop (phis) to account for the newly inserted
2974 // edge.
2975
2976 // Fix-up external users of the induction variables.
2977 for (const auto &Entry : Legal->getInductionVars())
2978 fixupIVUsers(Entry.first, Entry.second,
2980 IVEndValues[Entry.first], LoopMiddleBlock,
2981 VectorLoop->getHeader(), Plan, State);
2982 }
2983
2984 // Fix live-out phis not already fixed earlier.
2985 for (const auto &KV : Plan.getLiveOuts())
2986 KV.second->fixPhi(Plan, State);
2987
2989 sinkScalarOperands(&*PI);
2990
2991 // Remove redundant induction instructions.
2992 cse(VectorLoop->getHeader());
2993
2994 // Set/update profile weights for the vector and remainder loops as original
2995 // loop iterations are now distributed among them. Note that the original loop
2996 // represented by LoopScalarBody becomes the remainder loop after vectorization.
2997 //
2998 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
2999 // end up getting a slightly roughened result, but that should be OK since
3000 // profile is not inherently precise anyway. Note also that a possible bypass
3001 // of vector code caused by legality checks is ignored, optimistically assigning
3002 // all the weight to the vector loop.
3003 //
3004 // For scalable vectorization we can't know at compile time how many iterations
3005 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3006 // vscale of '1'.
3009 VF.getKnownMinValue() * UF);
3010}
3011
3013 // The basic block and loop containing the predicated instruction.
3014 auto *PredBB = PredInst->getParent();
3015 auto *VectorLoop = LI->getLoopFor(PredBB);
3016
3017 // Initialize a worklist with the operands of the predicated instruction.
3018 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3019
3020 // Holds instructions that we need to analyze again. An instruction may be
3021 // reanalyzed if we don't yet know if we can sink it or not.
3022 SmallVector<Instruction *, 8> InstsToReanalyze;
3023
3024 // Returns true if a given use occurs in the predicated block. Phi nodes use
3025 // their operands in their corresponding predecessor blocks.
3026 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3027 auto *I = cast<Instruction>(U.getUser());
3028 BasicBlock *BB = I->getParent();
3029 if (auto *Phi = dyn_cast<PHINode>(I))
3030 BB = Phi->getIncomingBlock(
3031 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3032 return BB == PredBB;
3033 };
3034
3035 // Iteratively sink the scalarized operands of the predicated instruction
3036 // into the block we created for it. When an instruction is sunk, its
3037 // operands are then added to the worklist. The algorithm ends once a full pass
3038 // through the worklist fails to sink a single instruction.
3039 bool Changed;
3040 do {
3041 // Add the instructions that need to be reanalyzed to the worklist, and
3042 // reset the changed indicator.
3043 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3044 InstsToReanalyze.clear();
3045 Changed = false;
3046
3047 while (!Worklist.empty()) {
3048 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3049
3050 // We can't sink an instruction if it is a phi node, is not in the loop,
3051 // may have side effects or may read from memory.
3052 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3053 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3054 I->mayHaveSideEffects() || I->mayReadFromMemory())
3055 continue;
3056
3057 // If the instruction is already in PredBB, check if we can sink its
3058 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3059 // sinking the scalar instruction I, hence it appears in PredBB; but it
3060 // may have failed to sink I's operands (recursively), which we try
3061 // (again) here.
3062 if (I->getParent() == PredBB) {
3063 Worklist.insert(I->op_begin(), I->op_end());
3064 continue;
3065 }
3066
3067 // It's legal to sink the instruction if all its uses occur in the
3068 // predicated block. Otherwise, there's nothing to do yet, and we may
3069 // need to reanalyze the instruction.
3070 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3071 InstsToReanalyze.push_back(I);
3072 continue;
3073 }
3074
3075 // Move the instruction to the beginning of the predicated block, and add
3076 // its operands to the worklist.
3077 I->moveBefore(&*PredBB->getFirstInsertionPt());
3078 Worklist.insert(I->op_begin(), I->op_end());
3079
3080 // The sinking may have enabled other instructions to be sunk, so we will
3081 // need to iterate.
3082 Changed = true;
3083 }
3084 } while (Changed);
3085}
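// Illustrative example (not part of the original source): if a scalarized
// address computation `%addr = add i64 %base, %off` still sits in the loop
// header but its only user is a predicated store in the block created for
// PredInst, the loop above moves %addr into that block and then re-queues
// %base and %off in case they can be sunk as well, iterating until a full pass
// sinks nothing.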
3086
3088 VPTransformState &State) {
3089 auto Iter = vp_depth_first_deep(Plan.getEntry());
3090 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3091 for (VPRecipeBase &P : VPBB->phis()) {
3092 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3093 if (!VPPhi)
3094 continue;
3095 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3096 // Make sure the builder has a valid insert point.
3097 Builder.SetInsertPoint(NewPhi);
3098 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3099 VPValue *Inc = VPPhi->getIncomingValue(i);
3100 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3101 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3102 }
3103 }
3104 }
3105}
3106
3107void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3108 // We should not collect Scalars more than once per VF. Right now, this
3109 // function is called from collectUniformsAndScalars(), which already does
3110 // this check. Collecting Scalars for VF=1 does not make any sense.
3111 assert(VF.isVector() && !Scalars.contains(VF) &&
3112 "This function should not be visited twice for the same VF");
3113
3114 // This avoids any chances of creating a REPLICATE recipe during planning
3115 // since that would result in generation of scalarized code during execution,
3116 // which is not supported for scalable vectors.
3117 if (VF.isScalable()) {
3118 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3119 return;
3120 }
3121
3123
3124 // These sets are used to seed the analysis with pointers used by memory
3125 // accesses that will remain scalar.
3127 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3128 auto *Latch = TheLoop->getLoopLatch();
3129
3130 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3131 // The pointer operands of loads and stores will be scalar as long as the
3132 // memory access is not a gather or scatter operation. The value operand of a
3133 // store will remain scalar if the store is scalarized.
3134 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3135 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3136 assert(WideningDecision != CM_Unknown &&
3137 "Widening decision should be ready at this moment");
3138 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3139 if (Ptr == Store->getValueOperand())
3140 return WideningDecision == CM_Scalarize;
3141 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3142            "Ptr is neither a value nor a pointer operand");
3143 return WideningDecision != CM_GatherScatter;
3144 };
3145
3146 // A helper that returns true if the given value is a bitcast or
3147 // getelementptr instruction contained in the loop.
3148 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3149 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3150 isa<GetElementPtrInst>(V)) &&
3151            TheLoop->contains(V);
3152   };
3153
3154 // A helper that evaluates a memory access's use of a pointer. If the use will
3155 // be a scalar use and the pointer is only used by memory accesses, we place
3156 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3157 // PossibleNonScalarPtrs.
3158 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3159 // We only care about bitcast and getelementptr instructions contained in
3160 // the loop.
3161 if (!isLoopVaryingBitCastOrGEP(Ptr))
3162 return;
3163
3164 // If the pointer has already been identified as scalar (e.g., if it was
3165 // also identified as uniform), there's nothing to do.
3166 auto *I = cast<Instruction>(Ptr);
3167 if (Worklist.count(I))
3168 return;
3169
3170 // If the use of the pointer will be a scalar use, and all users of the
3171 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3172 // place the pointer in PossibleNonScalarPtrs.
3173 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3174 return isa<LoadInst>(U) || isa<StoreInst>(U);
3175 }))
3176 ScalarPtrs.insert(I);
3177 else
3178 PossibleNonScalarPtrs.insert(I);
3179 };
3180
3181   // We seed the scalars analysis with two classes of instructions: (1)
3182 // instructions marked uniform-after-vectorization and (2) bitcast,
3183 // getelementptr and (pointer) phi instructions used by memory accesses
3184 // requiring a scalar use.
3185 //
3186 // (1) Add to the worklist all instructions that have been identified as
3187 // uniform-after-vectorization.
3188 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3189
3190 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3191 // memory accesses requiring a scalar use. The pointer operands of loads and
3192   // stores will be scalar as long as the memory access is not a gather or
3193 // scatter operation. The value operand of a store will remain scalar if the
3194 // store is scalarized.
3195 for (auto *BB : TheLoop->blocks())
3196 for (auto &I : *BB) {
3197 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3198 evaluatePtrUse(Load, Load->getPointerOperand());
3199 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3200 evaluatePtrUse(Store, Store->getPointerOperand());
3201 evaluatePtrUse(Store, Store->getValueOperand());
3202 }
3203 }
3204 for (auto *I : ScalarPtrs)
3205 if (!PossibleNonScalarPtrs.count(I)) {
3206 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3207 Worklist.insert(I);
3208 }
3209
3210 // Insert the forced scalars.
3211 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3212 // induction variable when the PHI user is scalarized.
3213 auto ForcedScalar = ForcedScalars.find(VF);
3214 if (ForcedScalar != ForcedScalars.end())
3215 for (auto *I : ForcedScalar->second) {
3216 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3217 Worklist.insert(I);
3218 }
3219
3220 // Expand the worklist by looking through any bitcasts and getelementptr
3221 // instructions we've already identified as scalar. This is similar to the
3222 // expansion step in collectLoopUniforms(); however, here we're only
3223 // expanding to include additional bitcasts and getelementptr instructions.
3224 unsigned Idx = 0;
3225 while (Idx != Worklist.size()) {
3226 Instruction *Dst = Worklist[Idx++];
3227 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3228 continue;
3229 auto *Src = cast<Instruction>(Dst->getOperand(0));
3230 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3231 auto *J = cast<Instruction>(U);
3232 return !TheLoop->contains(J) || Worklist.count(J) ||
3233 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3234 isScalarUse(J, Src));
3235 })) {
3236 Worklist.insert(Src);
3237 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3238 }
3239 }
3240
3241 // An induction variable will remain scalar if all users of the induction
3242 // variable and induction variable update remain scalar.
3243 for (const auto &Induction : Legal->getInductionVars()) {
3244 auto *Ind = Induction.first;
3245 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3246
3247 // If tail-folding is applied, the primary induction variable will be used
3248 // to feed a vector compare.
3249 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3250 continue;
3251
3252 // Returns true if \p Indvar is a pointer induction that is used directly by
3253 // load/store instruction \p I.
3254 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3255 Instruction *I) {
3256 return Induction.second.getKind() ==
3257                  InductionDescriptor::IK_PtrInduction &&
3258              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3259 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3260 };
3261
3262 // Determine if all users of the induction variable are scalar after
3263 // vectorization.
3264 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3265 auto *I = cast<Instruction>(U);
3266 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3267 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3268 });
3269 if (!ScalarInd)
3270 continue;
3271
3272 // If the induction variable update is a fixed-order recurrence, neither the
3273   // induction variable nor its update should be marked scalar after
3274 // vectorization.
3275 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3276 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3277 continue;
3278
3279 // Determine if all users of the induction variable update instruction are
3280 // scalar after vectorization.
3281 auto ScalarIndUpdate =
3282 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3283 auto *I = cast<Instruction>(U);
3284 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3285 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3286 });
3287 if (!ScalarIndUpdate)
3288 continue;
3289
3290 // The induction variable and its update instruction will remain scalar.
3291 Worklist.insert(Ind);
3292 Worklist.insert(IndUpdate);
3293 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3294 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3295 << "\n");
3296 }
3297
3298 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3299}
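// A sketch of the analysis above on hypothetical IR (for illustration): given
//
//   %gep = getelementptr inbounds i32, ptr %p, i64 %iv
//   %v   = load i32, ptr %gep
//
// if the load is widened into a consecutive vector load (not a gather), %gep
// only feeds a scalar pointer use, so it ends up in Scalars[VF]; if the
// induction feeding %gep has no other vectorized users, the induction and its
// update remain scalar as well.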
3300
3301bool LoopVectorizationCostModel::isScalarWithPredication(
3302     Instruction *I, ElementCount VF) const {
3303 if (!isPredicatedInst(I))
3304 return false;
3305
3306 // Do we have a non-scalar lowering for this predicated
3307 // instruction? No - it is scalar with predication.
3308 switch(I->getOpcode()) {
3309 default:
3310 return true;
3311 case Instruction::Call:
3312 if (VF.isScalar())
3313 return true;
3314 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3315 .Kind == CM_Scalarize;
3316 case Instruction::Load:
3317 case Instruction::Store: {
3318       auto *Ptr = getLoadStorePointerOperand(I);
3319       auto *Ty = getLoadStoreType(I);
3320 Type *VTy = Ty;
3321 if (VF.isVector())
3322 VTy = VectorType::get(Ty, VF);
3323 const Align Alignment = getLoadStoreAlignment(I);
3324 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3325 TTI.isLegalMaskedGather(VTy, Alignment))
3326 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3327 TTI.isLegalMaskedScatter(VTy, Alignment));
3328 }
3329 case Instruction::UDiv:
3330 case Instruction::SDiv:
3331 case Instruction::SRem:
3332 case Instruction::URem: {
3333 // We have the option to use the safe-divisor idiom to avoid predication.
3334 // The cost based decision here will always select safe-divisor for
3335 // scalable vectors as scalarization isn't legal.
3336 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3337 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3338 }
3339 }
3340}
3341
3342bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3343   if (!blockNeedsPredicationForAnyReason(I->getParent()))
3344 return false;
3345
3346 // Can we prove this instruction is safe to unconditionally execute?
3347 // If not, we must use some form of predication.
3348 switch(I->getOpcode()) {
3349 default:
3350 return false;
3351 case Instruction::Load:
3352 case Instruction::Store: {
3353 if (!Legal->isMaskRequired(I))
3354 return false;
3355 // When we know the load's address is loop invariant and the instruction
3356 // in the original scalar loop was unconditionally executed then we
3357 // don't need to mark it as a predicated instruction. Tail folding may
3358 // introduce additional predication, but we're guaranteed to always have
3359 // at least one active lane. We call Legal->blockNeedsPredication here
3360   // because it doesn't query tail-folding. For stores, we must prove both
3361   // speculation safety (which follows from the same argument as for loads)
3362   // and that the value being stored is correct. The easiest form of the
3363   // latter is to require that all values stored are the same.
3364     if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3365         (isa<LoadInst>(I) ||
3366 (isa<StoreInst>(I) &&
3367 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3368 !Legal->blockNeedsPredication(I->getParent()))
3369 return false;
3370 return true;
3371 }
3372 case Instruction::UDiv:
3373 case Instruction::SDiv:
3374 case Instruction::SRem:
3375 case Instruction::URem:
3376 // TODO: We can use the loop-preheader as context point here and get
3377 // context sensitive reasoning
3378     return !isSafeToSpeculativelyExecute(I);
3379   case Instruction::Call:
3380 return Legal->isMaskRequired(I);
3381 }
3382}
3383
3384std::pair<InstructionCost, InstructionCost>
3385LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3386                                     ElementCount VF) const {
3387 assert(I->getOpcode() == Instruction::UDiv ||
3388 I->getOpcode() == Instruction::SDiv ||
3389 I->getOpcode() == Instruction::SRem ||
3390 I->getOpcode() == Instruction::URem);
3392
3393   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3395 // Scalarization isn't legal for scalable vector types
3396 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3397 if (!VF.isScalable()) {
3398 // Get the scalarization cost and scale this amount by the probability of
3399 // executing the predicated block. If the instruction is not predicated,
3400 // we fall through to the next case.
3401 ScalarizationCost = 0;
3402
3403 // These instructions have a non-void type, so account for the phi nodes
3404 // that we will create. This cost is likely to be zero. The phi node
3405 // cost, if any, should be scaled by the block probability because it
3406 // models a copy at the end of each predicated block.
3407 ScalarizationCost += VF.getKnownMinValue() *
3408 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3409
3410 // The cost of the non-predicated instruction.
3411 ScalarizationCost += VF.getKnownMinValue() *
3412 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3413
3414 // The cost of insertelement and extractelement instructions needed for
3415 // scalarization.
3416 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3417
3418 // Scale the cost by the probability of executing the predicated blocks.
3419 // This assumes the predicated block for each vector lane is equally
3420 // likely.
3421 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3422 }
3423 InstructionCost SafeDivisorCost = 0;
3424
3425 auto *VecTy = ToVectorTy(I->getType(), VF);
3426
3427 // The cost of the select guard to ensure all lanes are well defined
3428 // after we speculate above any internal control flow.
3429 SafeDivisorCost += TTI.getCmpSelInstrCost(
3430 Instruction::Select, VecTy,
3431 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3432       CmpInst::BAD_ICMP_PREDICATE, CostKind);
3433
3434 // Certain instructions can be cheaper to vectorize if they have a constant
3435 // second vector operand. One example of this are shifts on x86.
3436 Value *Op2 = I->getOperand(1);
3437 auto Op2Info = TTI.getOperandInfo(Op2);
3438 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3439 Legal->isInvariant(Op2))
3440     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3441
3442 SmallVector<const Value *, 4> Operands(I->operand_values());
3443 SafeDivisorCost += TTI.getArithmeticInstrCost(
3444 I->getOpcode(), VecTy, CostKind,
3445 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3446 Op2Info, Operands, I);
3447 return {ScalarizationCost, SafeDivisorCost};
3448}
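// A worked example of the trade-off above, using made-up TTI costs purely for
// illustration: for VF = 4 with a scalar udiv cost of 4, a PHI cost of 0, a
// scalarization overhead of 8 and getReciprocalPredBlockProb() == 2, the
// scalarization estimate is (4*0 + 4*4 + 8) / 2 = 12. If the vector udiv
// costs 10 and the guarding select costs 1, the safe-divisor estimate is 11,
// so the safe-divisor idiom would look cheaper for this VF.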
3449
3450bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3451     Instruction *I, ElementCount VF) const {
3452 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3453   assert(getWideningDecision(I, VF) == CM_Unknown &&
3454          "Decision should not be set yet.");
3455 auto *Group = getInterleavedAccessGroup(I);
3456 assert(Group && "Must have a group.");
3457
3458   // If the instruction's allocated size doesn't equal its type size, it
3459 // requires padding and will be scalarized.
3460 auto &DL = I->getDataLayout();
3461 auto *ScalarTy = getLoadStoreType(I);
3462 if (hasIrregularType(ScalarTy, DL))
3463 return false;
3464
3465 // If the group involves a non-integral pointer, we may not be able to
3466 // losslessly cast all values to a common type.
3467 unsigned InterleaveFactor = Group->getFactor();
3468 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3469 for (unsigned i = 0; i < InterleaveFactor; i++) {
3470 Instruction *Member = Group->getMember(i);
3471 if (!Member)
3472 continue;
3473 auto *MemberTy = getLoadStoreType(Member);
3474 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3475 // Don't coerce non-integral pointers to integers or vice versa.
3476 if (MemberNI != ScalarNI) {
3477 // TODO: Consider adding special nullptr value case here
3478 return false;
3479 } else if (MemberNI && ScalarNI &&
3480 ScalarTy->getPointerAddressSpace() !=
3481 MemberTy->getPointerAddressSpace()) {
3482 return false;
3483 }
3484 }
3485
3486 // Check if masking is required.
3487 // A Group may need masking for one of two reasons: it resides in a block that
3488 // needs predication, or it was decided to use masking to deal with gaps
3489 // (either a gap at the end of a load-access that may result in a speculative
3490 // load, or any gaps in a store-access).
3491 bool PredicatedAccessRequiresMasking =
3492 blockNeedsPredicationForAnyReason(I->getParent()) &&
3493       Legal->isMaskRequired(I);
3494   bool LoadAccessWithGapsRequiresEpilogMasking =
3495 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3496       !isScalarEpilogueAllowed();
3497   bool StoreAccessWithGapsRequiresMasking =
3498 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3499 if (!PredicatedAccessRequiresMasking &&
3500 !LoadAccessWithGapsRequiresEpilogMasking &&
3501 !StoreAccessWithGapsRequiresMasking)
3502 return true;
3503
3504 // If masked interleaving is required, we expect that the user/target had
3505 // enabled it, because otherwise it either wouldn't have been created or
3506 // it should have been invalidated by the CostModel.
3507   assert(useMaskedInterleavedAccesses(TTI) &&
3508          "Masked interleave-groups for predicated accesses are not enabled.");
3509
3510 if (Group->isReverse())
3511 return false;
3512
3513 auto *Ty = getLoadStoreType(I);
3514 const Align Alignment = getLoadStoreAlignment(I);
3515 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3516 : TTI.isLegalMaskedStore(Ty, Alignment);
3517}
3518
3519bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3520     Instruction *I, ElementCount VF) {
3521 // Get and ensure we have a valid memory instruction.
3522 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3523
3524   auto *Ptr = getLoadStorePointerOperand(I);
3525   auto *ScalarTy = getLoadStoreType(I);
3526
3527 // In order to be widened, the pointer should be consecutive, first of all.
3528 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3529 return false;
3530
3531 // If the instruction is a store located in a predicated block, it will be
3532 // scalarized.
3533 if (isScalarWithPredication(I, VF))
3534 return false;
3535
3536   // If the instruction's allocated size doesn't equal its type size, it
3537 // requires padding and will be scalarized.
3538 auto &DL = I->getDataLayout();
3539 if (hasIrregularType(ScalarTy, DL))
3540 return false;
3541
3542 return true;
3543}
3544
3545void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3546 // We should not collect Uniforms more than once per VF. Right now,
3547 // this function is called from collectUniformsAndScalars(), which
3548 // already does this check. Collecting Uniforms for VF=1 does not make any
3549 // sense.
3550
3551 assert(VF.isVector() && !Uniforms.contains(VF) &&
3552 "This function should not be visited twice for the same VF");
3553
3554   // Visit the list of Uniforms. If we do not find any uniform value, we will
3555   // not analyze it again. Uniforms.count(VF) will return 1.
3556 Uniforms[VF].clear();
3557
3558 // We now know that the loop is vectorizable!
3559 // Collect instructions inside the loop that will remain uniform after
3560 // vectorization.
3561
3562 // Global values, params and instructions outside of current loop are out of
3563 // scope.
3564 auto isOutOfScope = [&](Value *V) -> bool {
3565 Instruction *I = dyn_cast<Instruction>(V);
3566 return (!I || !TheLoop->contains(I));
3567 };
3568
3569 // Worklist containing uniform instructions demanding lane 0.
3570 SetVector<Instruction *> Worklist;
3571
3572 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3573 // that require predication must not be considered uniform after
3574 // vectorization, because that would create an erroneous replicating region
3575 // where only a single instance out of VF should be formed.
3576 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
3577 if (isOutOfScope(I)) {
3578 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3579 << *I << "\n");
3580 return;
3581 }
3582 if (isPredicatedInst(I)) {
3583 LLVM_DEBUG(
3584 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3585 << "\n");
3586 return;
3587 }
3588 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3589 Worklist.insert(I);
3590 };
3591
3592 // Start with the conditional branches exiting the loop. If the branch
3593 // condition is an instruction contained in the loop that is only used by the
3594 // branch, it is uniform.
3595   SmallVector<BasicBlock *, 4> Exiting;
3596   TheLoop->getExitingBlocks(Exiting);
3597 for (BasicBlock *E : Exiting) {
3598 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3599 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3600 addToWorklistIfAllowed(Cmp);
3601 }
3602
3603 auto PrevVF = VF.divideCoefficientBy(2);
3604 // Return true if all lanes perform the same memory operation, and we can
3605   // thus choose to execute only one.
3606 auto isUniformMemOpUse = [&](Instruction *I) {
3607 // If the value was already known to not be uniform for the previous
3608 // (smaller VF), it cannot be uniform for the larger VF.
3609 if (PrevVF.isVector()) {
3610 auto Iter = Uniforms.find(PrevVF);
3611 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3612 return false;
3613 }
3614 if (!Legal->isUniformMemOp(*I, VF))
3615 return false;
3616 if (isa<LoadInst>(I))
3617 // Loading the same address always produces the same result - at least
3618 // assuming aliasing and ordering which have already been checked.
3619 return true;
3620 // Storing the same value on every iteration.
3621 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3622 };
3623
3624 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
3625 InstWidening WideningDecision = getWideningDecision(I, VF);
3626 assert(WideningDecision != CM_Unknown &&
3627 "Widening decision should be ready at this moment");
3628
3629 if (isUniformMemOpUse(I))
3630 return true;
3631
3632 return (WideningDecision == CM_Widen ||
3633 WideningDecision == CM_Widen_Reverse ||
3634 WideningDecision == CM_Interleave);
3635 };
3636
3637 // Returns true if Ptr is the pointer operand of a memory access instruction
3638 // I, I is known to not require scalarization, and the pointer is not also
3639 // stored.
3640 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3641 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3642 return false;
3643 return getLoadStorePointerOperand(I) == Ptr &&
3644 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3645 };
3646
3647 // Holds a list of values which are known to have at least one uniform use.
3648 // Note that there may be other uses which aren't uniform. A "uniform use"
3649 // here is something which only demands lane 0 of the unrolled iterations;
3650 // it does not imply that all lanes produce the same value (e.g. this is not
3651 // the usual meaning of uniform)
3652 SetVector<Value *> HasUniformUse;
3653
3654 // Scan the loop for instructions which are either a) known to have only
3655 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3656 for (auto *BB : TheLoop->blocks())
3657 for (auto &I : *BB) {
3658 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3659 switch (II->getIntrinsicID()) {
3660 case Intrinsic::sideeffect:
3661 case Intrinsic::experimental_noalias_scope_decl:
3662 case Intrinsic::assume:
3663 case Intrinsic::lifetime_start:
3664 case Intrinsic::lifetime_end:
3665         if (TheLoop->hasLoopInvariantOperands(&I))
3666           addToWorklistIfAllowed(&I);
3667 break;
3668 default:
3669 break;
3670 }
3671 }
3672
3673 // ExtractValue instructions must be uniform, because the operands are
3674 // known to be loop-invariant.
3675 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3676 assert(isOutOfScope(EVI->getAggregateOperand()) &&
3677 "Expected aggregate value to be loop invariant");
3678 addToWorklistIfAllowed(EVI);
3679 continue;
3680 }
3681
3682 // If there's no pointer operand, there's nothing to do.
3683       auto *Ptr = getLoadStorePointerOperand(&I);
3684       if (!Ptr)
3685 continue;
3686
3687 if (isUniformMemOpUse(&I))
3688 addToWorklistIfAllowed(&I);
3689
3690 if (isVectorizedMemAccessUse(&I, Ptr))
3691 HasUniformUse.insert(Ptr);
3692 }
3693
3694 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3695 // demanding) users. Since loops are assumed to be in LCSSA form, this
3696 // disallows uses outside the loop as well.
3697 for (auto *V : HasUniformUse) {
3698 if (isOutOfScope(V))
3699 continue;
3700 auto *I = cast<Instruction>(V);
3701 auto UsersAreMemAccesses =
3702 llvm::all_of(I->users(), [&](User *U) -> bool {
3703 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
3704 });
3705 if (UsersAreMemAccesses)
3706 addToWorklistIfAllowed(I);
3707 }
3708
3709 // Expand Worklist in topological order: whenever a new instruction
3710   // is added, its users should already be inside the Worklist. This ensures
3711   // that a uniform instruction will only be used by uniform instructions.
3712 unsigned idx = 0;
3713 while (idx != Worklist.size()) {
3714 Instruction *I = Worklist[idx++];
3715
3716 for (auto *OV : I->operand_values()) {
3717 // isOutOfScope operands cannot be uniform instructions.
3718 if (isOutOfScope(OV))
3719 continue;
3720       // First-order recurrence phis should typically be considered
3721       // non-uniform.
3722 auto *OP = dyn_cast<PHINode>(OV);
3723       if (OP && Legal->isFixedOrderRecurrence(OP))
3724         continue;
3725 // If all the users of the operand are uniform, then add the
3726 // operand into the uniform worklist.
3727 auto *OI = cast<Instruction>(OV);
3728 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3729 auto *J = cast<Instruction>(U);
3730 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
3731 }))
3732 addToWorklistIfAllowed(OI);
3733 }
3734 }
3735
3736 // For an instruction to be added into Worklist above, all its users inside
3737 // the loop should also be in Worklist. However, this condition cannot be
3738 // true for phi nodes that form a cyclic dependence. We must process phi
3739 // nodes separately. An induction variable will remain uniform if all users
3740 // of the induction variable and induction variable update remain uniform.
3741 // The code below handles both pointer and non-pointer induction variables.
3742 BasicBlock *Latch = TheLoop->getLoopLatch();
3743 for (const auto &Induction : Legal->getInductionVars()) {
3744 auto *Ind = Induction.first;
3745 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3746
3747 // Determine if all users of the induction variable are uniform after
3748 // vectorization.
3749 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3750 auto *I = cast<Instruction>(U);
3751 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3752 isVectorizedMemAccessUse(I, Ind);
3753 });
3754 if (!UniformInd)
3755 continue;
3756
3757 // Determine if all users of the induction variable update instruction are
3758 // uniform after vectorization.
3759 auto UniformIndUpdate =
3760 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3761 auto *I = cast<Instruction>(U);
3762 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3763 isVectorizedMemAccessUse(I, IndUpdate);
3764 });
3765 if (!UniformIndUpdate)
3766 continue;
3767
3768 // The induction variable and its update instruction will remain uniform.
3769 addToWorklistIfAllowed(Ind);
3770 addToWorklistIfAllowed(IndUpdate);
3771 }
3772
3773 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3774}
3775
3776bool LoopVectorizationCostModel::runtimeChecksRequired() {
3777   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3778
3779   if (Legal->getRuntimePointerChecking()->Need) {
3780     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3781 "runtime pointer checks needed. Enable vectorization of this "
3782 "loop with '#pragma clang loop vectorize(enable)' when "
3783 "compiling with -Os/-Oz",
3784 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3785 return true;
3786 }
3787
3788 if (!PSE.getPredicate().isAlwaysTrue()) {
3789 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3790 "runtime SCEV checks needed. Enable vectorization of this "
3791 "loop with '#pragma clang loop vectorize(enable)' when "
3792 "compiling with -Os/-Oz",
3793 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3794 return true;
3795 }
3796
3797 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3798 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3799 reportVectorizationFailure("Runtime stride check for small trip count",
3800 "runtime stride == 1 checks needed. Enable vectorization of "
3801 "this loop without such check by compiling with -Os/-Oz",
3802 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3803 return true;
3804 }
3805
3806 return false;
3807}
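// The remarks above point at the loop pragma as the escape hatch when
// building with -Os/-Oz. A minimal, illustrative use in source code:
//
//   #pragma clang loop vectorize(enable)
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];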
3808
3809bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3810 if (IsScalableVectorizationAllowed)
3811 return *IsScalableVectorizationAllowed;
3812
3813 IsScalableVectorizationAllowed = false;
3814   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3815     return false;
3817   if (Hints->isScalableVectorizationDisabled()) {
3818 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3819 "ScalableVectorizationDisabled", ORE, TheLoop);
3820 return false;
3821 }
3822
3823 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3824
3825 auto MaxScalableVF = ElementCount::getScalable(
3826 std::numeric_limits<ElementCount::ScalarTy>::max());
3827
3828 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3829 // FIXME: While for scalable vectors this is currently sufficient, this should
3830 // be replaced by a more detailed mechanism that filters out specific VFs,
3831 // instead of invalidating vectorization for a whole set of VFs based on the
3832 // MaxVF.
3833
3834 // Disable scalable vectorization if the loop contains unsupported reductions.
3835 if (!canVectorizeReductions(MaxScalableVF)) {
3836     reportVectorizationInfo(
3837         "Scalable vectorization not supported for the reduction "
3838 "operations found in this loop.",
3839 "ScalableVFUnfeasible", ORE, TheLoop);
3840 return false;
3841 }
3842
3843 // Disable scalable vectorization if the loop contains any instructions
3844 // with element types not supported for scalable vectors.
3845 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3846 return !Ty->isVoidTy() &&
3847                !TTI.isElementTypeLegalForScalableVector(Ty);
3848       })) {
3849 reportVectorizationInfo("Scalable vectorization is not supported "
3850 "for all element types found in this loop.",
3851 "ScalableVFUnfeasible", ORE, TheLoop);
3852 return false;
3853 }
3854
3856 reportVectorizationInfo("The target does not provide maximum vscale value "
3857 "for safe distance analysis.",
3858 "ScalableVFUnfeasible", ORE, TheLoop);
3859 return false;
3860 }
3861
3862 IsScalableVectorizationAllowed = true;
3863 return true;
3864}
3865
3867LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3868 if (!isScalableVectorizationAllowed())
3869 return ElementCount::getScalable(0);
3870
3871 auto MaxScalableVF = ElementCount::getScalable(
3872 std::numeric_limits<ElementCount::ScalarTy>::max());
3873   if (Legal->isSafeForAnyVectorWidth())
3874     return MaxScalableVF;
3875
3876 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3877 // Limit MaxScalableVF by the maximum safe dependence distance.
3878 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3879
3880 if (!MaxScalableVF)
3881     reportVectorizationInfo(
3882         "Max legal vector width too small, scalable vectorization "
3883 "unfeasible.",
3884 "ScalableVFUnfeasible", ORE, TheLoop);
3885
3886 return MaxScalableVF;
3887}
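// For example (illustrative numbers only): with MaxSafeElements == 32 and a
// target whose maximum vscale is 16, the clamp above yields
// ElementCount::getScalable(32 / 16), i.e. "vscale x 2", so that even at the
// largest possible vscale the vector stays within the 32-element safe
// dependence distance.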
3888
3889FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3890 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3892 unsigned SmallestType, WidestType;
3893 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3894
3895 // Get the maximum safe dependence distance in bits computed by LAA.
3896 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3897   // the memory access that is most restrictive (involved in the smallest
3898 // dependence distance).
3899 unsigned MaxSafeElements =
3900       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3901
3902 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3903 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3904
3905 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3906 << ".\n");
3907 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3908 << ".\n");
3909
3910 // First analyze the UserVF, fall back if the UserVF should be ignored.
3911 if (UserVF) {
3912 auto MaxSafeUserVF =
3913 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3914
3915 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3916 // If `VF=vscale x N` is safe, then so is `VF=N`
3917 if (UserVF.isScalable())
3918 return FixedScalableVFPair(
3919 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3920 else
3921 return UserVF;
3922 }
3923
3924 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3925
3926 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3927 // is better to ignore the hint and let the compiler choose a suitable VF.
3928 if (!UserVF.isScalable()) {
3929 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3930 << " is unsafe, clamping to max safe VF="
3931 << MaxSafeFixedVF << ".\n");
3932 ORE->emit([&]() {
3933 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3934                                           TheLoop->getStartLoc(),
3935                                           TheLoop->getHeader())
3936 << "User-specified vectorization factor "
3937 << ore::NV("UserVectorizationFactor", UserVF)
3938 << " is unsafe, clamping to maximum safe vectorization factor "
3939 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3940 });
3941 return MaxSafeFixedVF;
3942 }
3943
3944     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3945       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3946 << " is ignored because scalable vectors are not "
3947 "available.\n");
3948 ORE->emit([&]() {
3949 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3950                                           TheLoop->getStartLoc(),
3951                                           TheLoop->getHeader())
3952 << "User-specified vectorization factor "
3953 << ore::NV("UserVectorizationFactor", UserVF)
3954 << " is ignored because the target does not support scalable "
3955 "vectors. The compiler will pick a more suitable value.";
3956 });
3957 } else {
3958 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3959 << " is unsafe. Ignoring scalable UserVF.\n");
3960 ORE->emit([&]() {
3961 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3962                                           TheLoop->getStartLoc(),
3963                                           TheLoop->getHeader())
3964 << "User-specified vectorization factor "
3965 << ore::NV("UserVectorizationFactor", UserVF)
3966 << " is unsafe. Ignoring the hint to let the compiler pick a "
3967 "more suitable value.";
3968 });
3969 }
3970 }
3971
3972 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3973 << " / " << WidestType << " bits.\n");
3974
3975   FixedScalableVFPair Result(ElementCount::getFixed(1),
3976                              ElementCount::getScalable(0));
3977   if (auto MaxVF =
3978 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3979 MaxSafeFixedVF, FoldTailByMasking))
3980 Result.FixedVF = MaxVF;
3981
3982 if (auto MaxVF =
3983 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3984 MaxSafeScalableVF, FoldTailByMasking))
3985 if (MaxVF.isScalable()) {
3986 Result.ScalableVF = MaxVF;
3987 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3988 << "\n");
3989 }
3990
3991 return Result;
3992}
3993
3994FixedScalableVFPair
3995LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3996   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3997   // TODO: It may be useful to do so, since it's still likely to be dynamically
3998 // uniform if the target can skip.
3999     reportVectorizationFailure(
4000         "Not inserting runtime ptr check for divergent target",
4001 "runtime pointer checks needed. Not enabled for divergent target",
4002 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4003     return FixedScalableVFPair::getNone();
4004   }
4005
4006 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4007 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4008 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4009 if (TC == 1) {
4010 reportVectorizationFailure("Single iteration (non) loop",
4011 "loop trip count is one, irrelevant for vectorization",
4012 "SingleIterationLoop", ORE, TheLoop);
4013     return FixedScalableVFPair::getNone();
4014   }
4015
4016 switch (ScalarEpilogueStatus) {
4017   case CM_ScalarEpilogueAllowed:
4018     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4019   case CM_ScalarEpilogueNotAllowedUsePredicate:
4020     [[fallthrough]];
4021   case CM_ScalarEpilogueNotNeededUsePredicate:
4022     LLVM_DEBUG(
4023 dbgs() << "LV: vector predicate hint/switch found.\n"
4024 << "LV: Not allowing scalar epilogue, creating predicated "
4025 << "vector loop.\n");
4026 break;
4027   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4028     // fallthrough as a special case of OptForSize
4029   case CM_ScalarEpilogueNotAllowedOptSize:
4030     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4031 LLVM_DEBUG(
4032 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4033 else
4034 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4035 << "count.\n");
4036
4037 // Bail if runtime checks are required, which are not good when optimising
4038 // for size.
4039     if (runtimeChecksRequired())
4040       return FixedScalableVFPair::getNone();
4041
4042 break;
4043 }
4044
4045 // The only loops we can vectorize without a scalar epilogue, are loops with
4046 // a bottom-test and a single exiting block. We'd have to handle the fact
4047 // that not every instruction executes on the last iteration. This will
4048 // require a lane mask which varies through the vector loop body. (TODO)
4049   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4050     // If there was a tail-folding hint/switch, but we can't fold the tail by
4051 // masking, fallback to a vectorization with a scalar epilogue.
4052 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4053 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4054 "scalar epilogue instead.\n");
4055 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4056 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4057 }
4058     return FixedScalableVFPair::getNone();
4059   }
4060
4061 // Now try the tail folding
4062
4063 // Invalidate interleave groups that require an epilogue if we can't mask
4064 // the interleave-group.
4065   if (!useMaskedInterleavedAccesses(TTI)) {
4066     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4067 "No decisions should have been taken at this point");
4068 // Note: There is no need to invalidate any cost modeling decisions here, as
4069   // none were taken so far.
4070     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4071   }
4072
4073 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4074
4075 // Avoid tail folding if the trip count is known to be a multiple of any VF
4076 // we choose.
4077 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4078 MaxFactors.FixedVF.getFixedValue();
4079 if (MaxFactors.ScalableVF) {
4080 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4081 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4082 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4083 *MaxPowerOf2RuntimeVF,
4084 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4085 } else
4086 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4087 }
4088
4089 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4090 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4091 "MaxFixedVF must be a power of 2");
4092 unsigned MaxVFtimesIC =
4093 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4094 ScalarEvolution *SE = PSE.getSE();
4095 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4096 const SCEV *ExitCount = SE->getAddExpr(
4097 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4098 const SCEV *Rem = SE->getURemExpr(
4099 SE->applyLoopGuards(ExitCount, TheLoop),
4100 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4101 if (Rem->isZero()) {
4102 // Accept MaxFixedVF if we do not have a tail.
4103 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4104 return MaxFactors;
4105 }
4106 }
4107
4108 // If we don't know the precise trip count, or if the trip count that we
4109 // found modulo the vectorization factor is not zero, try to fold the tail
4110 // by masking.
4111 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4112 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4113 if (foldTailByMasking()) {
4115 LLVM_DEBUG(
4116 dbgs()
4117 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4118 "try to generate VP Intrinsics with scalable vector "
4119 "factors only.\n");
4120 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4121 // for now.
4122 // TODO: extend it for fixed vectors, if required.
4123 assert(MaxFactors.ScalableVF.isScalable() &&
4124 "Expected scalable vector factor.");
4125
4126 MaxFactors.FixedVF = ElementCount::getFixed(1);
4127 }
4128 return MaxFactors;
4129 }
4130
4131 // If there was a tail-folding hint/switch, but we can't fold the tail by
4132 // masking, fallback to a vectorization with a scalar epilogue.
4133 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4134 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4135 "scalar epilogue instead.\n");
4136 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4137 return MaxFactors;
4138 }
4139
4140 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4141 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4142     return FixedScalableVFPair::getNone();
4143   }
4144
4145 if (TC == 0) {
4146     reportVectorizationFailure(
4147         "Unable to calculate the loop count due to complex control flow",
4148 "unable to calculate the loop count due to complex control flow",
4149 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4150     return FixedScalableVFPair::getNone();
4151   }
4152
4153   reportVectorizationFailure(
4154       "Cannot optimize for size and vectorize at the same time.",
4155 "cannot optimize for size and vectorize at the same time. "
4156 "Enable vectorization of this loop with '#pragma clang loop "
4157 "vectorize(enable)' when compiling with -Os/-Oz",
4158 "NoTailLoopWithOptForSize", ORE, TheLoop);
4159   return FixedScalableVFPair::getNone();
4160}
4161
4162ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4163 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4164 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4165 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4166 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4167 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4168                              : TargetTransformInfo::RGK_FixedWidthVector);
4169
4170 // Convenience function to return the minimum of two ElementCounts.
4171 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4172 assert((LHS.isScalable() == RHS.isScalable()) &&
4173 "Scalable flags must match");
4174 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4175 };
4176
4177 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4178   // Note that both WidestRegister and WidestType may not be powers of 2.
4179 auto MaxVectorElementCount = ElementCount::get(
4180 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4181 ComputeScalableMaxVF);
4182 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4183 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4184 << (MaxVectorElementCount * WidestType) << " bits.\n");
4185
4186 if (!MaxVectorElementCount) {
4187 LLVM_DEBUG(dbgs() << "LV: The target has no "
4188 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4189 << " vector registers.\n");
4190 return ElementCount::getFixed(1);
4191 }
4192
4193 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4194 if (MaxVectorElementCount.isScalable() &&
4195 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4196 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4197 auto Min = Attr.getVScaleRangeMin();
4198 WidestRegisterMinEC *= Min;
4199 }
4200
4201 // When a scalar epilogue is required, at least one iteration of the scalar
4202 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4203 // max VF that results in a dead vector loop.
4204 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4205 MaxTripCount -= 1;
4206
4207 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4208 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4209 // If upper bound loop trip count (TC) is known at compile time there is no
4210 // point in choosing VF greater than TC (as done in the loop below). Select
4211 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4212 // scalable, we only fall back on a fixed VF when the TC is less than or
4213 // equal to the known number of lanes.
4214 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4215 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4216 "exceeding the constant trip count: "
4217 << ClampedUpperTripCount << "\n");
4218 return ElementCount::get(
4219 ClampedUpperTripCount,
4220 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4221 }
4222
4223   TargetTransformInfo::RegisterKind RegKind =
4224       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4225                            : TargetTransformInfo::RGK_FixedWidthVector;
4226   ElementCount MaxVF = MaxVectorElementCount;
4227 if (MaximizeBandwidth ||
4228 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4231 auto MaxVectorElementCountMaxBW = ElementCount::get(
4232 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4233 ComputeScalableMaxVF);
4234 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4235
4236 // Collect all viable vectorization factors larger than the default MaxVF
4237 // (i.e. MaxVectorElementCount).
4238     SmallVector<ElementCount, 8> VFs;
4239     for (ElementCount VS = MaxVectorElementCount * 2;
4240 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4241 VFs.push_back(VS);
4242
4243 // For each VF calculate its register usage.
4244 auto RUs = calculateRegisterUsage(VFs);
4245
4246 // Select the largest VF which doesn't require more registers than existing
4247 // ones.
4248 for (int I = RUs.size() - 1; I >= 0; --I) {
4249 const auto &MLU = RUs[I].MaxLocalUsers;
4250 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4251 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4252 })) {
4253 MaxVF = VFs[I];
4254 break;
4255 }
4256 }
4257 if (ElementCount MinVF =
4258 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4259 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4260 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4261 << ") with target's minimum: " << MinVF << '\n');
4262 MaxVF = MinVF;
4263 }
4264 }
4265
4266 // Invalidate any widening decisions we might have made, in case the loop
4267     // requires predication (decided later), but we have already made some
4268 // load/store widening decisions.
4269     invalidateCostModelingDecisions();
4270   }
4271 return MaxVF;
4272}
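// A concrete, illustrative instance of the clamping above: with 128-bit
// vector registers and a widest loop type of 32 bits, MaxVectorElementCount
// is bit_floor(128 / 32) = 4 lanes; if the known trip count is 3, no tail
// folding is used and no scalar epilogue is required, the returned VF is
// instead clamped to bit_floor(3) = 2.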
4273
4274/// Convenience function that returns the value of vscale_range iff
4275/// vscale_range.min == vscale_range.max or otherwise returns the value
4276/// returned by the corresponding TTI method.
4277static std::optional<unsigned>
4278getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4279   const Function *Fn = L->getHeader()->getParent();
4280 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4281 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4282 auto Min = Attr.getVScaleRangeMin();
4283 auto Max = Attr.getVScaleRangeMax();
4284 if (Max && Min == Max)
4285 return Max;
4286 }
4287
4288 return TTI.getVScaleForTuning();
4289}
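// For example (illustrative): a function carrying the attribute
// vscale_range(2,2) makes vscale known to be exactly 2, so 2 is returned
// directly instead of falling back to TTI.getVScaleForTuning().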
4290
4291bool LoopVectorizationPlanner::isMoreProfitable(
4292 const VectorizationFactor &A, const VectorizationFactor &B) const {
4293 InstructionCost CostA = A.Cost;
4294 InstructionCost CostB = B.Cost;
4295
4296 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4297
4298 // Improve estimate for the vector width if it is scalable.
4299 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4300 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4301 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4302 if (A.Width.isScalable())
4303 EstimatedWidthA *= *VScale;
4304 if (B.Width.isScalable())
4305 EstimatedWidthB *= *VScale;
4306 }
4307
4308 // Assume vscale may be larger than 1 (or the value being tuned for),
4309 // so that scalable vectorization is slightly favorable over fixed-width
4310 // vectorization.
4311 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4312 A.Width.isScalable() && !B.Width.isScalable();
4313
4314 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4315 const InstructionCost &RHS) {
4316 return PreferScalable ? LHS <= RHS : LHS < RHS;
4317 };
4318
4319 // To avoid the need for FP division:
4320 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4321 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4322 if (!MaxTripCount)
4323 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4324
4325 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4326 InstructionCost VectorCost,
4327 InstructionCost ScalarCost) {
4328 // If the trip count is a known (possibly small) constant, the trip count
4329 // will be rounded up to an integer number of iterations under
4330 // FoldTailByMasking. The total cost in that case will be
4331 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4332 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4333 // some extra overheads, but for the purpose of comparing the costs of
4334 // different VFs we can use this to compare the total loop-body cost
4335 // expected after vectorization.
4336 if (CM.foldTailByMasking())
4337 return VectorCost * divideCeil(MaxTripCount, VF);
4338 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4339 };
4340
4341 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4342 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4343 return CmpFn(RTCostA, RTCostB);
4344}
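// A small numeric sketch of the cross-multiplication above (made-up costs,
// for illustration): comparing A = {vscale x 4, Cost 8} against
// B = {fixed 4, Cost 6} with no known trip count, an assumed vscale of 2 and
// a target that does not prefer fixed over scalable at equal cost, the
// estimated widths are 8 and 4, and the test CostA*WidthB <= CostB*WidthA,
// i.e. 8*4 <= 6*8 (32 <= 48), holds, so the scalable factor A is chosen.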
4345
4346static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4347                                   OptimizationRemarkEmitter *ORE,
4348                                 Loop *TheLoop) {
4349 if (InvalidCosts.empty())
4350 return;
4351
4352 // Emit a report of VFs with invalid costs in the loop.
4353
4354 // Group the remarks per instruction, keeping the instruction order from
4355 // InvalidCosts.
4356 std::map<Instruction *, unsigned> Numbering;
4357 unsigned I = 0;
4358 for (auto &Pair : InvalidCosts)
4359 if (!Numbering.count(Pair.first))
4360 Numbering[Pair.first] = I++;
4361
4362 // Sort the list, first on instruction(number) then on VF.
4363 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4364 if (Numbering[A.first] != Numbering[B.first])
4365 return Numbering[A.first] < Numbering[B.first];
4366 const auto &LHS = A.second;
4367 const auto &RHS = B.second;
4368 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4369 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4370 });
4371
4372 // For a list of ordered instruction-vf pairs:
4373 // [(load, vf1), (load, vf2), (store, vf1)]
4374 // Group the instructions together to emit separate remarks for:
4375 // load (vf1, vf2)
4376 // store (vf1)
4377 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4378 auto Subset = ArrayRef<InstructionVFPair>();
4379 do {
4380 if (Subset.empty())
4381 Subset = Tail.take_front(1);
4382
4383 Instruction *I = Subset.front().first;
4384
4385 // If the next instruction is different, or if there are no other pairs,
4386 // emit a remark for the collated subset. e.g.
4387     //  [(load, vf1), (load, vf2)]
4388     // to emit:
4389     //  remark: invalid costs for 'load' at VF=(vf1, vf2)
4390 if (Subset == Tail || Tail[Subset.size()].first != I) {
4391 std::string OutString;
4392 raw_string_ostream OS(OutString);
4393 assert(!Subset.empty() && "Unexpected empty range");
4394 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4395 for (const auto &Pair : Subset)
4396 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4397 OS << "):";
4398 if (auto *CI = dyn_cast<CallInst>(I))
4399 OS << " call to " << CI->getCalledFunction()->getName();
4400 else
4401 OS << " " << I->getOpcodeName();
4402 OS.flush();
4403 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4404 Tail = Tail.drop_front(Subset.size());
4405 Subset = {};
4406 } else
4407 // Grow the subset by one element
4408 Subset = Tail.take_front(Subset.size() + 1);
4409 } while (!Tail.empty());
4410}
4411
4412/// Check if any recipe of \p Plan will generate a vector value, which will be
4413/// assigned a vector register.
4414static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4415                                 const TargetTransformInfo &TTI) {
4416 assert(VF.isVector() && "Checking a scalar VF?");
4417 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4419 DenseSet<VPRecipeBase *> EphemeralRecipes;
4420 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4421 // Set of already visited types.
4422 DenseSet<Type *> Visited;
4423 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4425 for (VPRecipeBase &R : *VPBB) {
4426 if (EphemeralRecipes.contains(&R))
4427 continue;
4428 // Continue early if the recipe is considered to not produce a vector
4429 // result. Note that this includes VPInstruction where some opcodes may
4430 // produce a vector, to preserve existing behavior as VPInstructions model
4431 // aspects not directly mapped to existing IR instructions.
4432 switch (R.getVPDefID()) {
4433 case VPDef::VPDerivedIVSC:
4434 case VPDef::VPScalarIVStepsSC:
4435 case VPDef::VPScalarCastSC:
4436 case VPDef::VPReplicateSC:
4437 case VPDef::VPInstructionSC:
4438 case VPDef::VPCanonicalIVPHISC:
4439 case VPDef::VPVectorPointerSC:
4440 case VPDef::VPExpandSCEVSC:
4441 case VPDef::VPEVLBasedIVPHISC:
4442 case VPDef::VPPredInstPHISC:
4443 case VPDef::VPBranchOnMaskSC:
4444 continue;
4445 case VPDef::VPReductionSC:
4446 case VPDef::VPActiveLaneMaskPHISC:
4447 case VPDef::VPWidenCallSC:
4448 case VPDef::VPWidenCanonicalIVSC:
4449 case VPDef::VPWidenCastSC:
4450 case VPDef::VPWidenGEPSC:
4451 case VPDef::VPWidenSC:
4452 case VPDef::VPWidenSelectSC:
4453 case VPDef::VPBlendSC:
4454 case VPDef::VPFirstOrderRecurrencePHISC:
4455 case VPDef::VPWidenPHISC:
4456 case VPDef::VPWidenIntOrFpInductionSC:
4457 case VPDef::VPWidenPointerInductionSC:
4458 case VPDef::VPReductionPHISC:
4459 case VPDef::VPInterleaveSC:
4460 case VPDef::VPWidenLoadEVLSC:
4461 case VPDef::VPWidenLoadSC:
4462 case VPDef::VPWidenStoreEVLSC:
4463 case VPDef::VPWidenStoreSC:
4464 break;
4465 default:
4466 llvm_unreachable("unhandled recipe");
4467 }
4468
4469 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4470 Type *VectorTy = ToVectorTy(ScalarTy, VF);
4471 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4472 if (!NumLegalParts)
4473 return false;
4474 if (VF.isScalable()) {
4475 // <vscale x 1 x iN> is assumed to be profitable over iN because
4476 // scalable registers are a distinct register class from scalar
4477 // ones. If we ever find a target which wants to lower scalable
4478 // vectors back to scalars, we'll need to update this code to
4479 // explicitly ask TTI about the register class uses for each part.
4480 return NumLegalParts <= VF.getKnownMinValue();
4481 }
4482         // Two or more elements sharing a register are vectorized.
4483 return NumLegalParts < VF.getKnownMinValue();
4484 };
4485
4486       // If the recipe has no defs and is not a store (e.g., a branch), continue - no value to check.
4487 if (R.getNumDefinedValues() == 0 &&
4488 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4489 &R))
4490 continue;
4491       // For multi-def recipes (currently only interleaved loads), it suffices
4492       // to check the first def only.
4493       // For stores, check their stored value; for interleaved stores it
4494       // suffices to check the first stored value only. In all cases this is
4495       // the second operand.
4496 VPValue *ToCheck =
4497 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4498 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4499 if (!Visited.insert({ScalarTy}).second)
4500 continue;
4501 if (WillWiden(ScalarTy))
4502 return true;
4503 }
4504 }
4505
4506 return false;
4507}
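// For example (illustrative): with VF = 4 and a widened i64 value on a target
// whose vector registers are 128 bits wide, TTI.getNumberOfParts sees
// <4 x i64> as 2 legal parts; 2 < 4, so the recipe counts as producing real
// vector values and the VPlan is worth costing as a vector loop.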
4508
4509VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4510   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4511   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4512 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4513 assert(any_of(VPlans,
4514 [](std::unique_ptr<VPlan> &P) {
4515 return P->hasVF(ElementCount::getFixed(1));
4516 }) &&
4517 "Expected Scalar VF to be a candidate");
4518
4519 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4520 ExpectedCost);
4521 VectorizationFactor ChosenFactor = ScalarCost;
4522
4523 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4524 if (ForceVectorization &&
4525 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4526 // Ignore scalar width, because the user explicitly wants vectorization.
4527 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4528 // evaluation.
4529 ChosenFactor.Cost = InstructionCost::getMax();
4530 }
4531
4532 SmallVector<InstructionVFPair> InvalidCosts;
4533 for (auto &P : VPlans) {
4534 for (ElementCount VF : P->vectorFactors()) {
4535 // The cost for scalar VF=1 is already calculated, so ignore it.
4536 if (VF.isScalar())
4537 continue;
4538
4539 InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
4540 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4541
4542#ifndef NDEBUG
4543 unsigned AssumedMinimumVscale =
4544 getVScaleForTuning(OrigLoop, TTI).value_or(1);
4545 unsigned Width =
4546 Candidate.Width.isScalable()
4547 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4548 : Candidate.Width.getFixedValue();
4549 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4550 << " costs: " << (Candidate.Cost / Width));
4551 if (VF.isScalable())
4552 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4553 << AssumedMinimumVscale << ")");
4554 LLVM_DEBUG(dbgs() << ".\n");
4555#endif
4556
4557 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4558 LLVM_DEBUG(
4559 dbgs()
4560 << "LV: Not considering vector loop of width " << VF
4561 << " because it will not generate any vector instructions.\n");
4562 continue;
4563 }
4564
4565 // If profitable add it to ProfitableVF list.
4566 if (isMoreProfitable(Candidate, ScalarCost))
4567 ProfitableVFs.push_back(Candidate);
4568
4569 if (isMoreProfitable(Candidate, ChosenFactor))
4570 ChosenFactor = Candidate;
4571 }
4572 }
4573
4574 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
4575
4576   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4577     reportVectorizationFailure(
4578       "There are conditional stores.",
4579 "store that is conditionally executed prevents vectorization",
4580 "ConditionalStore", ORE, OrigLoop);
4581 ChosenFactor = ScalarCost;
4582 }
4583
4584 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4585 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4586 << "LV: Vectorization seems to be not beneficial, "
4587 << "but was forced by a user.\n");
4588 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4589 return ChosenFactor;
4590}
4591
4592bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4593 ElementCount VF) const {
4594 // Cross iteration phis such as reductions need special handling and are
4595 // currently unsupported.
4596 if (any_of(OrigLoop->getHeader()->phis(),
4597 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4598 return false;
4599
4600 // Phis with uses outside of the loop require special handling and are
4601 // currently unsupported.
4602 for (const auto &Entry : Legal->getInductionVars()) {
4603 // Look for uses of the value of the induction at the last iteration.
4604 Value *PostInc =
4605 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4606 for (User *U : PostInc->users())
4607 if (!OrigLoop->contains(cast<Instruction>(U)))
4608 return false;
4609     // Look for uses of the penultimate value of the induction.
4610 for (User *U : Entry.first->users())
4611 if (!OrigLoop->contains(cast<Instruction>(U)))
4612 return false;
4613 }
4614
4615   // Epilogue vectorization code has not been audited to ensure it handles
4616   // non-latch exits properly. It may be fine, but it needs to be audited and
4617   // tested.
4618 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4619 return false;
4620
4621 return true;
4622}
4623
4625 const ElementCount VF) const {
4626 // FIXME: We need a much better cost-model to take different parameters such
4627 // as register pressure, code size increase and cost of extra branches into
4628 // account. For now we apply a very crude heuristic and only consider loops
4629 // with vectorization factors larger than a certain value.
4630
4631 // Allow the target to opt out entirely.
4633 return false;
4634
4635 // We also consider epilogue vectorization unprofitable for targets that don't
4636 // consider interleaving beneficial (e.g., MVE).
4637 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4638 return false;
4639
4640 unsigned Multiplier = 1;
4641 if (VF.isScalable())
4642 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
4643 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4644 return true;
4645 return false;
4646}
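// A minimal sketch (illustrative only, not part of LoopVectorize.cpp) of the
// threshold check above, with assumed numbers: a main-loop VF of "vscale x 2",
// an assumed tuning vscale of 4, and an assumed value of 16 for the
// EpilogueVectorizationMinVF option referenced above.
static bool exampleEpilogueVectorizationProfitable() {
  unsigned KnownMinVF = 2;      // main-loop VF = vscale x 2
  unsigned Multiplier = 4;      // assumed vscale for tuning
  unsigned MinVF = 16;          // assumed EpilogueVectorizationMinVF value
  return Multiplier * KnownMinVF >= MinVF; // 8 >= 16 -> not profitable
}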
4647
4649 const ElementCount MainLoopVF, unsigned IC) {
4652 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4653 return Result;
4654 }
4655
4656 if (!CM.isScalarEpilogueAllowed()) {
4657 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4658 "epilogue is allowed.\n");
4659 return Result;
4660 }
4661
4662 // Not really a cost consideration, but check for unsupported cases here to
4663 // simplify the logic.
4664 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4665 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4666 "is not a supported candidate.\n");
4667 return Result;
4668 }
4669
4671 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4673 if (hasPlanWithVF(ForcedEC))
4674 return {ForcedEC, 0, 0};
4675 else {
4676 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4677 "viable.\n");
4678 return Result;
4679 }
4680 }
4681
4682 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4683 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4684 LLVM_DEBUG(
4685 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4686 return Result;
4687 }
4688
4689 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
4690 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4691 "this loop\n");
4692 return Result;
4693 }
4694
4695 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4696 // the main loop handles 8 lanes per iteration. We could still benefit from
4697 // vectorizing the epilogue loop with VF=4.
4698 ElementCount EstimatedRuntimeVF = MainLoopVF;
4699 if (MainLoopVF.isScalable()) {
4700 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
4701 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
4702 EstimatedRuntimeVF *= *VScale;
4703 }
4704
4705 ScalarEvolution &SE = *PSE.getSE();
4706 Type *TCType = Legal->getWidestInductionType();
4707 const SCEV *RemainingIterations = nullptr;
4708 for (auto &NextVF : ProfitableVFs) {
4709 // Skip candidate VFs without a corresponding VPlan.
4710 if (!hasPlanWithVF(NextVF.Width))
4711 continue;
4712
4713 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
4714 // vectors) or the VF of the main loop (fixed vectors).
4715 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4716 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4717 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
4718 continue;
4719
4720 // If NextVF is greater than the number of remaining iterations, the
4721 // epilogue loop would be dead. Skip such factors.
4722 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4723 // TODO: extend to support scalable VFs.
4724 if (!RemainingIterations) {
4725 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
4726 RemainingIterations = SE.getURemExpr(
4727 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4728 }
4729 if (SE.isKnownPredicate(
4731 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4732 RemainingIterations))
4733 continue;
4734 }
4735
4736 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
4737 Result = NextVF;
4738 }
4739
4740 if (Result != VectorizationFactor::Disabled())
4741 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4742 << Result.Width << "\n");
4743 return Result;
4744}
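// A minimal sketch (illustrative only, not part of LoopVectorize.cpp) of the
// dead-epilogue check above for fixed VFs, with assumed numbers: trip count
// 1000, main-loop VF 8, interleave count 4. The main vector loop consumes
// VF * IC iterations per trip, so only the remainder is left for the epilogue;
// any candidate epilogue VF that is not smaller than this remainder would
// never execute and is skipped.
static bool exampleEpilogueVFWouldBeDead(unsigned EpilogueVF) {
  unsigned TripCount = 1000;                      // assumed trip count
  unsigned MainVF = 8, IC = 4;                    // assumed main-loop factors
  unsigned Remaining = TripCount % (MainVF * IC); // 1000 % 32 == 8
  return EpilogueVF > Remaining;                  // e.g. 16 is dead, 4 is not
}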
4745
4746std::pair<unsigned, unsigned>
4748 unsigned MinWidth = -1U;
4749 unsigned MaxWidth = 8;
4751 // For in-loop reductions, no element types are added to ElementTypesInLoop
4752 // if there are no loads/stores in the loop. In this case, check through the
4753 // reduction variables to determine the maximum width.
4754 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4755 // Reset MaxWidth so that we can find the smallest type used by recurrences
4756 // in the loop.
4757 MaxWidth = -1U;
4758 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4759 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4760 // When finding the min width used by the recurrence we need to account
4761 // for casts on the input operands of the recurrence.
4762 MaxWidth = std::min<unsigned>(
4763 MaxWidth, std::min<unsigned>(
4766 }
4767 } else {
4768 for (Type *T : ElementTypesInLoop) {
4769 MinWidth = std::min<unsigned>(
4770 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4771 MaxWidth = std::max<unsigned>(
4772 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4773 }
4774 }
4775 return {MinWidth, MaxWidth};
4776}
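// A minimal sketch (illustrative only, not part of LoopVectorize.cpp) of the
// width bookkeeping above, assuming a loop whose loads/stores involve i8 and
// 32-bit float elements. It uses the same std::min/std::max/std::pair helpers
// as the function above.
static std::pair<unsigned, unsigned> exampleSmallestAndWidestTypes() {
  unsigned MinWidth = -1U, MaxWidth = 8;
  for (unsigned Bits : {8u, 32u}) {        // element sizes seen in the loop
    MinWidth = std::min(MinWidth, Bits);   // ends up 8
    MaxWidth = std::max(MaxWidth, Bits);   // ends up 32
  }
  return {MinWidth, MaxWidth};             // {8, 32}
}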
4777
4779 ElementTypesInLoop.clear();
4780 // For each block.
4781 for (BasicBlock *BB : TheLoop->blocks()) {
4782 // For each instruction in the loop.
4783 for (Instruction &I : BB->instructionsWithoutDebug()) {
4784 Type *T = I.getType();
4785
4786 // Skip ignored values.
4787 if (ValuesToIgnore.count(&I))
4788 continue;
4789
4790 // Only examine Loads, Stores and PHINodes.
4791 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4792 continue;
4793
4794 // Examine PHI nodes that are reduction variables. Update the type to
4795 // account for the recurrence type.
4796 if (auto *PN = dyn_cast<PHINode>(&I)) {
4797 if (!Legal->isReductionVariable(PN))
4798 continue;
4799 const RecurrenceDescriptor &RdxDesc =
4800 Legal->getReductionVars().find(PN)->second;
4803 RdxDesc.getRecurrenceType(),
4805 continue;
4806 T = RdxDesc.getRecurrenceType();
4807 }
4808
4809 // Examine the stored values.
4810 if (auto *ST = dyn_cast<StoreInst>(&I))
4811 T = ST->getValueOperand()->getType();
4812
4813 assert(T->isSized() &&
4814 "Expected the load/store/recurrence type to be sized");
4815
4816 ElementTypesInLoop.insert(T);
4817 }
4818 }
4819}
4820
4821unsigned
4823 InstructionCost LoopCost) {
4824 // -- The interleave heuristics --
4825 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4826 // There are many micro-architectural considerations that we can't predict
4827 // at this level. For example, frontend pressure (on decode or fetch) due to
4828 // code size, or the number and capabilities of the execution ports.
4829 //
4830 // We use the following heuristics to select the interleave count:
4831 // 1. If the code has reductions, then we interleave to break the cross
4832 // iteration dependency.
4833 // 2. If the loop is really small, then we interleave to reduce the loop
4834 // overhead.
4835 // 3. We don't interleave if we think that we will spill registers to memory
4836 // due to the increased register pressure.
4837
4839 return 1;
4840
4841 // Do not interleave if EVL is preferred and no User IC is specified.
4842 if (foldTailWithEVL()) {
4843 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4844 "Unroll factor forced to be 1.\n");
4845 return 1;
4846 }
4847
4848 // We used the distance for the interleave count.
4850 return 1;
4851
4852 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
4853 const bool HasReductions = !Legal->getReductionVars().empty();
4854
4855 // If we did not calculate the cost for VF (because the user selected the VF)
4856 // then we calculate the cost of VF here.
4857 if (LoopCost == 0) {
4858 LoopCost = expectedCost(VF);
4859 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4860
4861 // Loop body is free and there is no need for interleaving.
4862 if (LoopCost == 0)
4863 return 1;
4864 }
4865
4867 // We divide by these constants so assume that we have at least one
4868 // instruction that uses at least one register.
4869 for (auto& pair : R.MaxLocalUsers) {
4870 pair.second = std::max(pair.second, 1U);
4871 }
4872
4873 // We calculate the interleave count using the following formula.
4874 // Subtract the number of loop invariants from the number of available
4875 // registers. These registers are used by all of the interleaved instances.
4876 // Next, divide the remaining registers by the number of registers that is
4877 // required by the loop, in order to estimate how many parallel instances
4878 // fit without causing spills. All of this is rounded down if necessary to be
4879 // a power of two. We want power of two interleave count to simplify any
4880 // addressing operations or alignment considerations.
4881 // We also want power of two interleave counts to ensure that the induction
4882 // variable of the vector loop wraps to zero, when tail is folded by masking;
4883 // this currently happens when OptForSize, in which case IC is set to 1 above.
4884 unsigned IC = UINT_MAX;
4885
4886 for (auto& pair : R.MaxLocalUsers) {
4887 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4888 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4889 << " registers of "
4890 << TTI.getRegisterClassName(pair.first) << " register class\n");
4891 if (VF.isScalar()) {
4892 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4893 TargetNumRegisters = ForceTargetNumScalarRegs;
4894 } else {
4895 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4896 TargetNumRegisters = ForceTargetNumVectorRegs;
4897 }
4898 unsigned MaxLocalUsers = pair.second;
4899 unsigned LoopInvariantRegs = 0;
4900 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
4901 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
4902
4903 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4904 MaxLocalUsers);
4905 // Don't count the induction variable as interleaved.
4907 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4908 std::max(1U, (MaxLocalUsers - 1)));
4909 }
4910
4911 IC = std::min(IC, TmpIC);
4912 }
4913
4914 // Clamp the interleave ranges to reasonable counts.
4915 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4916
4917 // Check if the user has overridden the max.
4918 if (VF.isScalar()) {
4919 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4920 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4921 } else {
4922 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4923 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4924 }
4925
4926 unsigned EstimatedVF = VF.getKnownMinValue();
4927 if (VF.isScalable()) {
4928 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
4929 EstimatedVF *= *VScale;
4930 }
4931 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4932
4933 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4934 if (KnownTC > 0) {
4935 // At least one iteration must be scalar when this constraint holds. So the
4936 // maximum available iterations for interleaving is one less.
4937 unsigned AvailableTC =
4938 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4939
4940 // If trip count is known we select between two prospective ICs, where
4941 // 1) the aggressive IC is capped by the trip count divided by VF
4942 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4943 // The final IC is selected in a way that the epilogue loop trip count is
4944 // minimized while maximizing the IC itself, so that we either run the
4945 // vector loop at least once if it generates a small epilogue loop, or else
4946 // we run the vector loop at least twice.
4947
4948 unsigned InterleaveCountUB = bit_floor(
4949 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4950 unsigned InterleaveCountLB = bit_floor(std::max(
4951 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4952 MaxInterleaveCount = InterleaveCountLB;
4953
4954 if (InterleaveCountUB != InterleaveCountLB) {
4955 unsigned TailTripCountUB =
4956 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4957 unsigned TailTripCountLB =
4958 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4959 // If both produce the same scalar tail, maximize the IC to do the same work
4960 // in fewer vector loop iterations.
4961 if (TailTripCountUB == TailTripCountLB)
4962 MaxInterleaveCount = InterleaveCountUB;
4963 }
4964 } else if (BestKnownTC && *BestKnownTC > 0) {
4965 // At least one iteration must be scalar when this constraint holds. So the
4966 // maximum available iterations for interleaving is one less.
4967 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4968 ? (*BestKnownTC) - 1
4969 : *BestKnownTC;
4970
4971 // If the trip count is an estimated compile-time constant, limit the
4972 // IC to be capped by the trip count divided by VF * 2, such that the vector
4973 // loop runs at least twice to make interleaving seem profitable when there
4974 // is an epilogue loop present. Since the exact trip count is not known we
4975 // choose to be conservative in our IC estimate.
4976 MaxInterleaveCount = bit_floor(std::max(
4977 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4978 }
4979
4980 assert(MaxInterleaveCount > 0 &&
4981 "Maximum interleave count must be greater than 0");
4982
4983 // Clamp the calculated IC to be between 1 and the max interleave count
4984 // that the target and trip count allow.
4985 if (IC > MaxInterleaveCount)
4986 IC = MaxInterleaveCount;
4987 else
4988 // Make sure IC is greater than 0.
4989 IC = std::max(1u, IC);
4990
4991 assert(IC > 0 && "Interleave count must be greater than 0.");
4992
4993 // Interleave if we vectorized this loop and there is a reduction that could
4994 // benefit from interleaving.
4995 if (VF.isVector() && HasReductions) {
4996 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4997 return IC;
4998 }
4999
5000 // For any scalar loop that either requires runtime checks or predication we
5001 // are better off leaving this to the unroller. Note that if we've already
5002 // vectorized the loop we will have done the runtime check and so interleaving
5003 // won't require further checks.
5004 bool ScalarInterleavingRequiresPredication =
5005 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5006 return Legal->blockNeedsPredication(BB);
5007 }));
5008 bool ScalarInterleavingRequiresRuntimePointerCheck =
5010
5011 // We want to interleave small loops in order to reduce the loop overhead and
5012 // potentially expose ILP opportunities.
5013 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5014 << "LV: IC is " << IC << '\n'
5015 << "LV: VF is " << VF << '\n');
5016 const bool AggressivelyInterleaveReductions =
5017 TTI.enableAggressiveInterleaving(HasReductions);
5018 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5019 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5020 // We assume that the cost overhead is 1 and we use the cost model
5021 // to estimate the cost of the loop and interleave until the cost of the
5022 // loop overhead is about 5% of the cost of the loop.
5023 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5024 SmallLoopCost / *LoopCost.getValue()));
5025
5026 // Interleave until store/load ports (estimated by max interleave count) are
5027 // saturated.
5028 unsigned NumStores = Legal->getNumStores();
5029 unsigned NumLoads = Legal->getNumLoads();
5030 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5031 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5032
5033 // There is little point in interleaving for reductions containing selects
5034 // and compares when VF=1 since it may just create more overhead than it's
5035 // worth for loops with small trip counts. This is because we still have to
5036 // do the final reduction after the loop.
5037 bool HasSelectCmpReductions =
5038 HasReductions &&
5039 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5040 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5041 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5042 RdxDesc.getRecurrenceKind());
5043 });
5044 if (HasSelectCmpReductions) {
5045 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5046 return 1;
5047 }
5048
5049 // If we have a scalar reduction (vector reductions are already dealt with
5050 // by this point), we can increase the critical path length if the loop
5051 // we're interleaving is inside another loop. For tree-wise reductions
5052 // set the limit to 2, and for ordered reductions it's best to disable
5053 // interleaving entirely.
5054 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5055 bool HasOrderedReductions =
5056 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5057 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5058 return RdxDesc.isOrdered();
5059 });
5060 if (HasOrderedReductions) {
5061 LLVM_DEBUG(
5062 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5063 return 1;
5064 }
5065
5066 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5067 SmallIC = std::min(SmallIC, F);
5068 StoresIC = std::min(StoresIC, F);
5069 LoadsIC = std::min(LoadsIC, F);
5070 }
5071
5073 std::max(StoresIC, LoadsIC) > SmallIC) {
5074 LLVM_DEBUG(
5075 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5076 return std::max(StoresIC, LoadsIC);
5077 }
5078
5079 // If there are scalar reductions and TTI has enabled aggressive
5080 // interleaving for reductions, we will interleave to expose ILP.
5081 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5082 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5083 // Interleave no less than SmallIC but not as aggressively as the normal IC
5084 // to handle the rare situation when resources are too limited.
5085 return std::max(IC / 2, SmallIC);
5086 } else {
5087 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5088 return SmallIC;
5089 }
5090 }
5091
5092 // Interleave if this is a large loop (small loops are already dealt with by
5093 // this point) that could benefit from interleaving.
5094 if (AggressivelyInterleaveReductions) {
5095 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5096 return IC;
5097 }
5098
5099 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5100 return 1;
5101}
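// A minimal standalone sketch (illustrative only, not part of LoopVectorize.cpp)
// of the two main clamps applied above, with assumed numbers: 32 vector
// registers, 2 loop-invariant values, 7 simultaneously live values, an
// estimated VF of 8, an available trip count of 100 and a target maximum
// interleave factor of 8. bit_floor is the same power-of-two rounding helper
// the code above uses.
static unsigned exampleInterleaveCount() {
  // Register-pressure candidate: spare registers per interleaved copy.
  unsigned TargetNumRegisters = 32, LoopInvariantRegs = 2, MaxLocalUsers = 7;
  unsigned IC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
                                MaxLocalUsers);     // 30 / 7 = 4 -> 4
  // Trip-count clamp: aggressive (VF) and conservative (VF * 2) bounds.
  unsigned EstimatedVF = 8, AvailableTC = 100, MaxInterleave = 8;
  unsigned UB = llvm::bit_floor(
      std::min(AvailableTC / EstimatedVF, MaxInterleave));       // min(12,8) -> 8
  unsigned LB = llvm::bit_floor(
      std::min(AvailableTC / (EstimatedVF * 2), MaxInterleave)); // min(6,8) -> 4
  // Start from the conservative bound; it would only be raised to UB if both
  // bounds left the same scalar tail, which is not the case here (100 % 64 == 36
  // vs. 100 % 32 == 4).
  unsigned MaxInterleaveCount = LB;
  (void)UB;
  return std::min(IC, MaxInterleaveCount);          // final IC = 4
}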
5102
5105 // This function calculates the register usage by measuring the highest number
5106 // of values that are alive at a single location. Obviously, this is a very
5107 // rough estimation. We scan the loop in topological order and
5108 // assign a number to each instruction. We use RPO to ensure that defs are
5109 // met before their users. We assume that each instruction that has in-loop
5110 // users starts an interval. We record every time that an in-loop value is
5111 // used, so we have a list of the first and last occurrences of each
5112 // instruction. Next, we transpose this data structure into a multi-map that
5113 // holds the list of intervals that *end* at a specific location. This
5114 // multi-map allows us to perform a linear search. We scan the instructions linearly
5115 // and record each time that a new interval starts, by placing it in a set.
5116 // If we find this value in the multi-map then we remove it from the set.
5117 // The max register usage is the maximum size of the set.
5118 // We also search for instructions that are defined outside the loop, but are
5119 // used inside the loop. We need this number separately from the max-interval
5120 // usage number because when we unroll, loop-invariant values do not take
5121 // more registers.
5123 DFS.perform(LI);
5124
5125 RegisterUsage RU;
5126
5127 // Each 'key' in the map opens a new interval. The values
5128 // of the map are the index of the 'last seen' usage of the
5129 // instruction that is the key.
5131
5132 // Maps instruction to its index.
5134 // Marks the end of each interval.
5135 IntervalMap EndPoint;
5136 // Saves the list of instruction indices that are used in the loop.
5138 // Saves the list of values that are used in the loop but are defined outside
5139 // the loop (not including non-instruction values such as arguments and
5140 // constants).
5141 SmallSetVector<Instruction *, 8> LoopInvariants;
5142
5143 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5144 for (Instruction &I : BB->instructionsWithoutDebug()) {
5145 IdxToInstr.push_back(&I);
5146
5147 // Save the end location of each USE.
5148 for (Value *U : I.operands()) {
5149 auto *Instr = dyn_cast<Instruction>(U);
5150
5151 // Ignore non-instruction values such as arguments, constants, etc.
5152 // FIXME: Might need some motivation why these values are ignored. If
5153 // for example an argument is used inside the loop it will increase the
5154 // register pressure (so shouldn't we add it to LoopInvariants).
5155 if (!Instr)
5156 continue;
5157
5158 // If this instruction is outside the loop then record it and continue.
5159 if (!TheLoop->contains(Instr)) {
5160 LoopInvariants.insert(Instr);
5161 continue;
5162 }
5163
5164 // Overwrite previous end points.
5165 EndPoint[Instr] = IdxToInstr.size();
5166 Ends.insert(Instr);
5167 }
5168 }
5169 }
5170
5171 // Saves the list of intervals that end with the index in 'key'.
5172 using InstrList = SmallVector<Instruction *, 2>;
5173 DenseMap<unsigned, InstrList> TransposeEnds;
5174
5175 // Transpose the EndPoints to a list of values that end at each index.
5176 for (auto &Interval : EndPoint)
5177 TransposeEnds[Interval.second].push_back(Interval.first);
5178
5179 SmallPtrSet<Instruction *, 8> OpenIntervals;
5182
5183 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5184
5185 const auto &TTICapture = TTI;
5186 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5187 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5188 return 0;
5189 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5190 };
5191
5192 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5193 Instruction *I = IdxToInstr[i];
5194
5195 // Remove all of the instructions that end at this location.
5196 InstrList &List = TransposeEnds[i];
5197 for (Instruction *ToRemove : List)
5198 OpenIntervals.erase(ToRemove);
5199
5200 // Ignore instructions that are never used within the loop.
5201 if (!Ends.count(I))
5202 continue;
5203
5204 // Skip ignored values.
5205 if (ValuesToIgnore.count(I))
5206 continue;
5207
5209
5210 // For each VF find the maximum usage of registers.
5211 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5212 // Count the number of registers used, per register class, given all open
5213 // intervals.
5214 // Note that elements in this SmallMapVector will be default constructed
5215 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5216 // there is no previous entry for ClassID.
5218
5219 if (VFs[j].isScalar()) {
5220 for (auto *Inst : OpenIntervals) {
5221 unsigned ClassID =
5222 TTI.getRegisterClassForType(false, Inst->getType());
5223 // FIXME: The target might use more than one register for the type
5224 // even in the scalar case.
5225 RegUsage[ClassID] += 1;
5226 }
5227 } else {
5229 for (auto *Inst : OpenIntervals) {
5230 // Skip ignored values for VF > 1.
5231 if (VecValuesToIgnore.count(Inst))
5232 continue;
5233 if (isScalarAfterVectorization(Inst, VFs[j])) {
5234 unsigned ClassID =
5235 TTI.getRegisterClassForType(false, Inst->getType());
5236 // FIXME: The target might use more than one register for the type
5237 // even in the scalar case.
5238 RegUsage[ClassID] += 1;
5239 } else {
5240 unsigned ClassID =
5241 TTI.getRegisterClassForType(true, Inst->getType());
5242 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5243 }
5244 }
5245 }
5246
5247 for (auto& pair : RegUsage) {
5248 auto &Entry = MaxUsages[j][pair.first];
5249 Entry = std::max(Entry, pair.second);
5250 }
5251 }
5252
5253 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5254 << OpenIntervals.size() << '\n');
5255
5256 // Add the current instruction to the list of open intervals.
5257 OpenIntervals.insert(I);
5258 }
5259
5260 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5261 // Note that elements in this SmallMapVector will be default constructed
5262 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5263 // there is no previous entry for ClassID.
5265
5266 for (auto *Inst : LoopInvariants) {
5267 // FIXME: The target might use more than one register for the type
5268 // even in the scalar case.
5269 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5270 auto *I = cast<Instruction>(U);
5271 return TheLoop != LI->getLoopFor(I->getParent()) ||
5272 isScalarAfterVectorization(I, VFs[i]);
5273 });
5274
5275 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5276 unsigned ClassID =
5277 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5278 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5279 }
5280
5281 LLVM_DEBUG({
5282 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5283 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5284 << " item\n";
5285 for (const auto &pair : MaxUsages[i]) {
5286 dbgs() << "LV(REG): RegisterClass: "
5287 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5288 << " registers\n";
5289 }
5290 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5291 << " item\n";
5292 for (const auto &pair : Invariant) {
5293 dbgs() << "LV(REG): RegisterClass: "
5294 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5295 << " registers\n";
5296 }
5297 });
5298
5299 RU.LoopInvariantRegs = Invariant;
5300 RU.MaxLocalUsers = MaxUsages[i];
5301 RUs[i] = RU;
5302 }
5303
5304 return RUs;
5305}
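// A minimal standalone sketch (illustrative only, not part of LoopVectorize.cpp)
// of the open-interval counting described at the top of the function above,
// using a brute-force scan rather than the transposed end-point map. The live
// ranges are assumptions for illustration: value V is defined at Def[V] and
// last used at LastUse[V].
static unsigned exampleMaxLiveValues() {
  unsigned Def[]     = {0, 1, 2, 3, 4};
  unsigned LastUse[] = {2, 3, 3, 4, 5};
  unsigned MaxOpen = 0;
  for (unsigned Idx = 0; Idx <= 5; ++Idx) {
    unsigned Open = 0;
    for (unsigned V = 0; V < 5; ++V)
      if (Def[V] <= Idx && Idx <= LastUse[V])
        ++Open;                              // value V is live at this point
    MaxOpen = std::max(MaxOpen, Open);
  }
  return MaxOpen; // peaks at 3, e.g. values 0, 1 and 2 are live at index 2
}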
5306
5307bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5308 ElementCount VF) {
5309 // TODO: Cost model for emulated masked load/store is completely
5310 // broken. This hack guides the cost model to use an artificially
5311 // high enough value to practically disable vectorization with such
5312 // operations, except where the previously deployed legality hack allowed
5313 // using very low cost values. This is to avoid regressions coming simply
5314 // from moving the "masked load/store" check from legality to the cost model.
5315 // Masked load/gather emulation was previously never allowed.
5316 // Only a limited number of masked store/scatter emulations were allowed.
5318 "Expecting a scalar emulated instruction");
5319 return isa<LoadInst>(I) ||
5320 (isa<StoreInst>(I) &&
5321 NumPredStores > NumberOfStoresToPredicate);
5322}
5323
5325 // If we aren't vectorizing the loop, or if we've already collected the
5326 // instructions to scalarize, there's nothing to do. Collection may already
5327 // have occurred if we have a user-selected VF and are now computing the
5328 // expected cost for interleaving.
5329 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5330 return;
5331
5332 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5333 // not profitable to scalarize any instructions, the presence of VF in the
5334 // map will indicate that we've analyzed it already.
5335 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5336
5337 PredicatedBBsAfterVectorization[VF].clear();
5338
5339 // Find all the instructions that are scalar with predication in the loop and
5340 // determine if it would be better to not if-convert the blocks they are in.
5341 // If so, we also record the instructions to scalarize.
5342 for (BasicBlock *BB : TheLoop->blocks()) {
5344 continue;
5345 for (Instruction &I : *BB)
5346 if (isScalarWithPredication(&I, VF)) {
5347 ScalarCostsTy ScalarCosts;
5348 // Do not apply discount logic for:
5349 // 1. Scalars after vectorization, as there will only be a single copy
5350 // of the instruction.
5351 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5352 // 3. Emulated masked memrefs, if a hacked cost is needed.
5353 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5354 !useEmulatedMaskMemRefHack(&I, VF) &&
5355 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5356 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5357 // Remember that BB will remain after vectorization.
5358 PredicatedBBsAfterVectorization[VF].insert(BB);
5359 for (auto *Pred : predecessors(BB)) {
5360 if (Pred->getSingleSuccessor() == BB)
5361 PredicatedBBsAfterVectorization[VF].insert(Pred);
5362 }
5363 }
5364 }
5365}
5366
5367InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5368 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5369 assert(!isUniformAfterVectorization(PredInst, VF) &&
5370 "Instruction marked uniform-after-vectorization will be predicated");
5371
5372 // Initialize the discount to zero, meaning that the scalar version and the
5373 // vector version cost the same.
5374 InstructionCost Discount = 0;
5375
5376 // Holds instructions to analyze. The instructions we visit are mapped in
5377 // ScalarCosts. Those instructions are the ones that would be scalarized if
5378 // we find that the scalar version costs less.
5380
5381 // Returns true if the given instruction can be scalarized.
5382 auto canBeScalarized = [&](Instruction *I) -> bool {
5383 // We only attempt to scalarize instructions forming a single-use chain
5384 // from the original predicated block that would otherwise be vectorized.
5385 // Although not strictly necessary, we give up on instructions we know will
5386 // already be scalar to avoid traversing chains that are unlikely to be
5387 // beneficial.
5388 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5390 return false;
5391
5392 // If the instruction is scalar with predication, it will be analyzed
5393 // separately. We ignore it within the context of PredInst.
5394 if (isScalarWithPredication(I, VF))
5395 return false;
5396
5397 // If any of the instruction's operands are uniform after vectorization,
5398 // the instruction cannot be scalarized. This prevents, for example, a
5399 // masked load from being scalarized.
5400 //
5401 // We assume we will only emit a value for lane zero of an instruction
5402 // marked uniform after vectorization, rather than VF identical values.
5403 // Thus, if we scalarize an instruction that uses a uniform, we would
5404 // create uses of values corresponding to the lanes we aren't emitting code
5405 // for. This behavior can be changed by allowing getScalarValue to clone
5406 // the lane zero values for uniforms rather than asserting.
5407 for (Use &U : I->operands())
5408 if (auto *J = dyn_cast<Instruction>(U.get()))
5409 if (isUniformAfterVectorization(J, VF))
5410 return false;
5411
5412 // Otherwise, we can scalarize the instruction.
5413 return true;
5414 };
5415
5416 // Compute the expected cost discount from scalarizing the entire expression
5417 // feeding the predicated instruction. We currently only consider expressions
5418 // that are single-use instruction chains.
5419 Worklist.push_back(PredInst);
5420 while (!Worklist.empty()) {
5421 Instruction *I = Worklist.pop_back_val();
5422
5423 // If we've already analyzed the instruction, there's nothing to do.
5424 if (ScalarCosts.contains(I))
5425 continue;
5426
5427 // Compute the cost of the vector instruction. Note that this cost already
5428 // includes the scalarization overhead of the predicated instruction.
5429 InstructionCost VectorCost = getInstructionCost(I, VF);
5430
5431 // Compute the cost of the scalarized instruction. This cost is the cost of
5432 // the instruction as if it wasn't if-converted and instead remained in the
5433 // predicated block. We will scale this cost by block probability after
5434 // computing the scalarization overhead.
5435 InstructionCost ScalarCost =
5437
5438 // Compute the scalarization overhead of needed insertelement instructions
5439 // and phi nodes.
5441 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5442 ScalarCost += TTI.getScalarizationOverhead(
5443 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5444 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5445 /*Extract*/ false, CostKind);
5446 ScalarCost +=
5447 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5448 }
5449
5450 // Compute the scalarization overhead of needed extractelement
5451 // instructions. For each of the instruction's operands, if the operand can
5452 // be scalarized, add it to the worklist; otherwise, account for the
5453 // overhead.
5454 for (Use &U : I->operands())
5455 if (auto *J = dyn_cast<Instruction>(U.get())) {
5456 assert(VectorType::isValidElementType(J->getType()) &&
5457 "Instruction has non-scalar type");
5458 if (canBeScalarized(J))
5459 Worklist.push_back(J);
5460 else if (needsExtract(J, VF)) {
5461 ScalarCost += TTI.getScalarizationOverhead(
5462 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5463 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5464 /*Extract*/ true, CostKind);
5465 }
5466 }
5467
5468 // Scale the total scalar cost by block probability.
5469 ScalarCost /= getReciprocalPredBlockProb();
5470
5471 // Compute the discount. A non-negative discount means the vector version
5472 // of the instruction costs more, and scalarizing would be beneficial.
5473 Discount += VectorCost - ScalarCost;
5474 ScalarCosts[I] = ScalarCost;
5475 }
5476
5477 return Discount;
5478}
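// A minimal sketch (illustrative only, not part of LoopVectorize.cpp) of the
// discount arithmetic above, with assumed costs: a predicated instruction
// whose if-converted vector form costs 12, whose scalarized form costs 8
// before scaling, and an assumed reciprocal block probability of 2 (the block
// runs about every other iteration).
static int exampleScalarizationDiscount() {
  int VectorCost = 12;                    // assumed vectorized cost
  int ScalarCost = 8;                     // assumed scalarized cost, all lanes
  int ReciprocalPredBlockProb = 2;        // assumed probability of 1/2
  ScalarCost /= ReciprocalPredBlockProb;  // only paid when the block executes
  return VectorCost - ScalarCost;         // discount 8 >= 0: scalarizing wins
}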
5479
5483
5484 // For each block.
5485 for (BasicBlock *BB : TheLoop->blocks()) {
5486 InstructionCost BlockCost;
5487
5488 // For each instruction in the old loop.
5489 for (Instruction &I : BB->instructionsWithoutDebug()) {
5490 // Skip ignored values.
5491 if (ValuesToIgnore.count(&I) ||
5492 (VF.isVector() && VecValuesToIgnore.count(&I)))
5493 continue;
5494
5496
5497 // Check if we should override the cost.
5498 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5500
5501 // Keep a list of instructions with invalid costs.
5502 if (Invalid && !C.isValid())
5503 Invalid->emplace_back(&I, VF);
5504
5505 BlockCost += C;
5506 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5507 << VF << " For instruction: " << I << '\n');
5508 }
5509
5510 // If we are vectorizing a predicated block, it will have been
5511 // if-converted. This means that the block's instructions (aside from
5512 // stores and instructions that may divide by zero) will now be
5513 // unconditionally executed. For the scalar case, we may not always execute
5514 // the predicated block, if it is an if-else block. Thus, scale the block's
5515 // cost by the probability of executing it. blockNeedsPredication from
5516 // Legal is used so as to not include all blocks in tail folded loops.
5517 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5518 BlockCost /= getReciprocalPredBlockProb();
5519
5520 Cost += BlockCost;
5521 }
5522
5523 return Cost;
5524}
5525
5526/// Gets Address Access SCEV after verifying that the access pattern
5527/// is loop invariant except the induction variable dependence.
5528///
5529/// This SCEV can be sent to the Target in order to estimate the address
5530/// calculation cost.
5532 Value *Ptr,
5535 const Loop *TheLoop) {
5536
5537 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5538 if (!Gep)
5539 return nullptr;
5540
5541 // We are looking for a gep with all loop invariant indices except for one
5542 // which should be an induction variable.
5543 auto SE = PSE.getSE();
5544 unsigned NumOperands = Gep->getNumOperands();
5545 for (unsigned i = 1; i < NumOperands; ++i) {
5546 Value *Opd = Gep->getOperand(i);
5547 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5548 !Legal->isInductionVariable(Opd))
5549 return nullptr;
5550 }
5551
5552 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5553 return PSE.getSCEV(Ptr);
5554}
5555
5557LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5558 ElementCount VF) {
5559 assert(VF.isVector() &&
5560 "Scalarization cost of instruction implies vectorization.");
5561 if (VF.isScalable())
5563
5564 Type *ValTy = getLoadStoreType(I);
5565 auto SE = PSE.getSE();
5566
5567 unsigned AS = getLoadStoreAddressSpace(I);
5569 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5570 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5571 // that it is being called from this specific place.
5572
5573 // Figure out whether the access is strided and get the stride value
5574 // if it's known at compile time.
5575 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5576
5577 // Get the cost of the scalar memory instruction and address computation.
5579 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5580
5581 // Don't pass *I here, since it is scalar but will actually be part of a
5582 // vectorized loop where the user of it is a vectorized instruction.
5584 const Align Alignment = getLoadStoreAlignment(I);
5585 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5586 ValTy->getScalarType(),
5587 Alignment, AS, CostKind);
5588
5589 // Get the overhead of the extractelement and insertelement instructions
5590 // we might create due to scalarization.
5591 Cost += getScalarizationOverhead(I, VF, CostKind);
5592
5593 // If we have a predicated load/store, it will need extra i1 extracts and
5594 // conditional branches, but may not be executed for each vector lane. Scale
5595 // the cost by the probability of executing the predicated block.
5596 if (isPredicatedInst(I)) {
5598
5599 // Add the cost of an i1 extract and a branch
5600 auto *Vec_i1Ty =
5603 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5604 /*Insert=*/false, /*Extract=*/true, CostKind);
5605 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5606
5607 if (useEmulatedMaskMemRefHack(I, VF))
5608 // Artificially setting to a high enough value to practically disable
5609 // vectorization with such operations.
5610 Cost = 3000000;
5611 }
5612
5613 return Cost;
5614}
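// A minimal sketch (illustrative only, not part of LoopVectorize.cpp) of the
// scalarized memory-op cost composition above for an unpredicated access, with
// assumed per-lane costs: address computation 1, scalar memory op 4, and a
// combined insert/extract overhead of 6 for the whole access at a fixed VF of 4.
static unsigned exampleScalarizedMemOpCost() {
  unsigned VF = 4;
  unsigned AddrCost = 1, ScalarMemOpCost = 4, ScalarizationOverhead = 6;
  unsigned Cost = VF * AddrCost;       // 4 address computations
  Cost += VF * ScalarMemOpCost;        // 4 scalar loads/stores
  Cost += ScalarizationOverhead;       // extract/insert glue
  return Cost;                         // 4 + 16 + 6 = 26
}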
5615
5617LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5618 ElementCount VF) {
5619 Type *ValTy = getLoadStoreType(I);
5620 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5622 unsigned AS = getLoadStoreAddressSpace(I);
5623 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5625
5626 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5627 "Stride should be 1 or -1 for consecutive memory access");
5628 const Align Alignment = getLoadStoreAlignment(I);
5630 if (Legal->isMaskRequired(I)) {
5631 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5632 CostKind);
5633 } else {
5634 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5635 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5636 CostKind, OpInfo, I);
5637 }
5638
5639 bool Reverse = ConsecutiveStride < 0;
5640 if (Reverse)
5642 std::nullopt, CostKind, 0);
5643 return Cost;
5644}
5645
5647LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5648 ElementCount VF) {
5649 assert(Legal->isUniformMemOp(*I, VF));
5650
5651 Type *ValTy = getLoadStoreType(I);
5652 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5653 const Align Alignment = getLoadStoreAlignment(I);
5654 unsigned AS = getLoadStoreAddressSpace(I);
5656 if (isa<LoadInst>(I)) {
5657 return TTI.getAddressComputationCost(ValTy) +
5658 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5659 CostKind) +
5661 }
5662 StoreInst *SI = cast<StoreInst>(I);
5663
5664 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5665 return TTI.getAddressComputationCost(ValTy) +
5666 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5667 CostKind) +
5668 (isLoopInvariantStoreValue
5669 ? 0
5670 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5671 CostKind, VF.getKnownMinValue() - 1));
5672}
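// A minimal sketch (illustrative only, not part of LoopVectorize.cpp) of the
// uniform memory-op cost composition above, with assumed component costs:
// address computation 1, scalar memory op 4, broadcast 2, and extract of the
// last lane 3.
static unsigned exampleUniformMemOpCost(bool IsLoad, bool InvariantStoreValue) {
  unsigned AddrCost = 1, MemOpCost = 4, BroadcastCost = 2, ExtractCost = 3;
  if (IsLoad)
    return AddrCost + MemOpCost + BroadcastCost;       // load + splat: 7
  return AddrCost + MemOpCost +
         (InvariantStoreValue ? 0u : ExtractCost);     // store: 5 or 8
}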
5673
5675LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5676 ElementCount VF) {
5677 Type *ValTy = getLoadStoreType(I);
5678 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5679 const Align Alignment = getLoadStoreAlignment(I);
5681
5682 return TTI.getAddressComputationCost(VectorTy) +
5684 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5686}
5687
5689LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5690 ElementCount VF) {
5691 Type *ValTy = getLoadStoreType(I);
5692 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5693 unsigned AS = getLoadStoreAddressSpace(I);
5695
5696 auto Group = getInterleavedAccessGroup(I);
5697 assert(Group && "Fail to get an interleaved access group.");
5698
5699 unsigned InterleaveFactor = Group->getFactor();
5700 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5701
5702 // Holds the indices of existing members in the interleaved group.
5704 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5705 if (Group->getMember(IF))
5706 Indices.push_back(IF);
5707
5708 // Calculate the cost of the whole interleaved group.
5709 bool UseMaskForGaps =
5710 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5711 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5713 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
5714 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
5715
5716 if (Group->isReverse()) {
5717 // TODO: Add support for reversed masked interleaved access.
5719 "Reverse masked interleaved access not supported.");
5720 Cost += Group->getNumMembers() *
5722 std::nullopt, CostKind, 0);
5723 }
5724 return Cost;
5725}
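// A minimal sketch (illustrative only, not part of LoopVectorize.cpp) of the
// group bookkeeping above, assuming an interleave factor of 4 in which only
// members 0 and 2 exist. A store group with missing members needs a mask for
// the gaps.
static bool exampleStoreGroupNeedsGapMask() {
  bool Present[4] = {true, false, true, false}; // members 0 and 2 only
  unsigned Factor = 4, NumMembers = 0;
  for (bool P : Present)
    NumMembers += P;                            // 2 existing members
  bool IsStore = true;                          // assumed store group
  return IsStore && NumMembers < Factor;        // gaps -> mask required
}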
5726
5727std::optional<InstructionCost>
5729 Instruction *I, ElementCount VF, Type *Ty,
5731 using namespace llvm::PatternMatch;
5732 // Early exit if there are no in-loop reductions.
5733 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5734 return std::nullopt;
5735 auto *VectorTy = cast<VectorType>(Ty);
5736
5737 // We are looking for a pattern of, and finding the minimal acceptable cost:
5738 // reduce(mul(ext(A), ext(B))) or
5739 // reduce(mul(A, B)) or
5740 // reduce(ext(A)) or
5741 // reduce(A).
5742 // The basic idea is that we walk down the tree to do that, finding the root
5743 // reduction instruction in InLoopReductionImmediateChains. From there we find
5744 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5745 // of the components. If the reduction cost is lower, we return it for the
5746 // reduction instruction and 0 for the other instructions in the pattern. If
5747 // it is not, we return an invalid cost specifying the original cost method
5748 // should be used.
5749 Instruction *RetI = I;
5750 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5751 if (!RetI->hasOneUser())
5752 return std::nullopt;
5753 RetI = RetI->user_back();
5754 }
5755
5756 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5757 RetI->user_back()->getOpcode() == Instruction::Add) {
5758 RetI = RetI->user_back();
5759 }
5760
5761 // Test if the found instruction is a reduction, and if not return an invalid
5762 // cost specifying the parent to use the original cost modelling.
5763 if (!InLoopReductionImmediateChains.count(RetI))
5764 return std::nullopt;
5765
5766 // Find the reduction this chain is a part of and calculate the basic cost of
5767 // the reduction on its own.
5768 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5769 Instruction *ReductionPhi = LastChain;
5770 while (!isa<PHINode>(ReductionPhi))
5771 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5772
5773 const RecurrenceDescriptor &RdxDesc =
5774 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5775
5776 InstructionCost BaseCost;
5777 RecurKind RK = RdxDesc.getRecurrenceKind();
5780 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5781 RdxDesc.getFastMathFlags(), CostKind);
5782 } else {
5784 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5785 }
5786
5787 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5788 // normal fmul instruction to the cost of the fadd reduction.
5789 if (RK == RecurKind::FMulAdd)
5790 BaseCost +=
5791 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5792
5793 // If we're using ordered reductions then we can just return the base cost
5794 // here, since getArithmeticReductionCost calculates the full ordered
5795 // reduction cost when FP reassociation is not allowed.
5796 if (useOrderedReductions(RdxDesc))
5797 return BaseCost;
5798
5799 // Get the operand that was not the reduction chain and match it to one of the
5800 // patterns, returning the better cost if it is found.
5801 Instruction *RedOp = RetI->getOperand(1) == LastChain
5802 ? dyn_cast<Instruction>(RetI->getOperand(0))
5803 : dyn_cast<Instruction>(RetI->getOperand(1));
5804
5805 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5806
5807 Instruction *Op0, *Op1;
5808 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5809 match(RedOp,
5811 match(Op0, m_ZExtOrSExt(m_Value())) &&
5812 Op0->getOpcode() == Op1->getOpcode() &&
5813 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5815 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5816
5817 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5818 // Note that the extend opcodes need to all match, or if A==B they will have
5819 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5820 // which is equally fine.
5821 bool IsUnsigned = isa<ZExtInst>(Op0);
5822 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5823 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5824
5825 InstructionCost ExtCost =
5826 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5828 InstructionCost MulCost =
5829 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5830 InstructionCost Ext2Cost =
5831 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5833
5835 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5836
5837 if (RedCost.isValid() &&
5838 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5839 return I == RetI ? RedCost : 0;
5840 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5841 !TheLoop->isLoopInvariant(RedOp)) {
5842 // Matched reduce(ext(A))
5843 bool IsUnsigned = isa<ZExtInst>(RedOp);
5844 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5846 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5847 RdxDesc.getFastMathFlags(), CostKind);
5848
5849 InstructionCost ExtCost =
5850 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5852 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5853 return I == RetI ? RedCost : 0;
5854 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5855 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5856 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5857 Op0->getOpcode() == Op1->getOpcode() &&
5859 bool IsUnsigned = isa<ZExtInst>(Op0);
5860 Type *Op0Ty = Op0->getOperand(0)->getType();
5861 Type *Op1Ty = Op1->getOperand(0)->getType();
5862 Type *LargestOpTy =
5863 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5864 : Op0Ty;
5865 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5866
5867 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5868 // different sizes. We take the largest type as the ext to reduce, and add
5869 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5871 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5874 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5876 InstructionCost MulCost =
5877 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5878
5880 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5881 InstructionCost ExtraExtCost = 0;
5882 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5883 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5884 ExtraExtCost = TTI.getCastInstrCost(
5885 ExtraExtOp->getOpcode(), ExtType,
5886 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5888 }
5889
5890 if (RedCost.isValid() &&
5891 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5892 return I == RetI ? RedCost : 0;
5893 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5894 // Matched reduce.add(mul())
5895 InstructionCost MulCost =
5896 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5897
5899 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5900
5901 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5902 return I == RetI ? RedCost : 0;
5903 }
5904 }
5905
5906 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5907}
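// A minimal sketch (illustrative only, not part of LoopVectorize.cpp) of the
// cost comparison above for the reduce.add(mul(ext(A), ext(B))) pattern, with
// assumed component costs: each extend 2, the multiply 3, the plain add
// reduction 6, and a fused extended multiply-accumulate reduction of 5.
static bool exampleFusedReductionWins() {
  unsigned ExtCost0 = 2, ExtCost1 = 2, MulCost = 3, BaseCost = 6; // assumed
  unsigned RedCost = 5;                         // assumed fused reduction cost
  // Mirrors: RedCost + ExtraExtCost < ExtCost0 + ExtCost1 + MulCost + BaseCost,
  // with ExtraExtCost == 0 because both extends have the same source type.
  return RedCost < ExtCost0 + ExtCost1 + MulCost + BaseCost;      // 5 < 13
}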
5908
5910LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5911 ElementCount VF) {
5912 // Calculate scalar cost only. Vectorization cost should be ready at this
5913 // moment.
5914 if (VF.isScalar()) {
5915 Type *ValTy = getLoadStoreType(I);
5916 const Align Alignment = getLoadStoreAlignment(I);
5917 unsigned AS = getLoadStoreAddressSpace(I);
5918
5919 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5920 return TTI.getAddressComputationCost(ValTy) +
5921 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
5922 TTI::TCK_RecipThroughput, OpInfo, I);
5923 }
5924 return getWideningCost(I, VF);
5925}
5926
5927InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
5929
5930 // There is no mechanism yet to create a scalable scalarization loop,
5931 // so this is currently Invalid.
5932 if (VF.isScalable())
5934
5935 if (VF.isScalar())
5936 return 0;
5937
5939 Type *RetTy = ToVectorTy(I->getType(), VF);
5940 if (!RetTy->isVoidTy() &&
5941 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5943 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5944 /*Insert*/ true,
5945 /*Extract*/ false, CostKind);
5946
5947 // Some targets keep addresses scalar.
5948 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5949 return Cost;
5950
5951 // Some targets support efficient element stores.
5952 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5953 return Cost;
5954
5955 // Collect operands to consider.
5956 CallInst *CI = dyn_cast<CallInst>(I);
5957 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5958
5959 // Skip operands that do not require extraction/scalarization and do not incur
5960 // any overhead.
5962 for (auto *V : filterExtractingOperands(Ops, VF))
5963 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
5965 filterExtractingOperands(Ops, VF), Tys, CostKind);
5966}
5967
5969 if (VF.isScalar())
5970 return;
5971 NumPredStores = 0;
5972 for (BasicBlock *BB : TheLoop->blocks()) {
5973 // For each instruction in the old loop.
5974 for (Instruction &I : *BB) {
5976 if (!Ptr)
5977 continue;
5978
5979 // TODO: We should generate better code and update the cost model for
5980 // predicated uniform stores. Today they are treated as any other
5981 // predicated store (see added test cases in
5982 // invariant-store-vectorization.ll).
5983 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
5984 NumPredStores++;
5985
5986 if (Legal->isUniformMemOp(I, VF)) {
5987 auto isLegalToScalarize = [&]() {
5988 if (!VF.isScalable())
5989 // Scalarization of fixed length vectors "just works".
5990 return true;
5991
5992 // We have dedicated lowering for unpredicated uniform loads and
5993 // stores. Note that even with tail folding we know that at least
5994 // one lane is active (i.e. generalized predication is not possible
5995 // here), and the logic below depends on this fact.
5996 if (!foldTailByMasking())
5997 return true;
5998
5999 // For scalable vectors, a uniform memop load is always
6000 // uniform-by-parts and we know how to scalarize that.
6001 if (isa<LoadInst>(I))
6002 return true;
6003
6004 // A uniform store isn't necessarily uniform-by-part,
6005 // so we can't assume scalarization.
6006 auto &SI = cast<StoreInst>(I);
6007 return TheLoop->isLoopInvariant(SI.getValueOperand());
6008 };
6009
6010 const InstructionCost GatherScatterCost =
6012 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6013
6014 // Load: Scalar load + broadcast
6015 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6016 // FIXME: This cost is a significant under-estimate for tail folded
6017 // memory ops.
6018 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6019 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6020
6021 // Choose the better solution for the current VF. Note that invalid
6022 // costs compare as maximally large. If both are invalid, we get an
6023 // invalid cost, which signals a failure and a vectorization abort.
6024 if (GatherScatterCost < ScalarizationCost)
6025 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6026 else
6027 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6028 continue;
6029 }
6030
6031 // We assume that widening is the best solution when possible.
6032 if (memoryInstructionCanBeWidened(&I, VF)) {
6033 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6034 int ConsecutiveStride = Legal->isConsecutivePtr(
6036 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6037 "Expected consecutive stride.");
6038 InstWidening Decision =
6039 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6040 setWideningDecision(&I, VF, Decision, Cost);
6041 continue;
6042 }
6043
6044 // Choose between Interleaving, Gather/Scatter or Scalarization.
6046 unsigned NumAccesses = 1;
6047 if (isAccessInterleaved(&I)) {
6048 auto Group = getInterleavedAccessGroup(&I);
6049 assert(Group && "Fail to get an interleaved access group.");
6050
6051 // Make one decision for the whole group.
6052 if (getWideningDecision(&I, VF) != CM_Unknown)
6053 continue;
6054
6055 NumAccesses = Group->getNumMembers();
6057 InterleaveCost = getInterleaveGroupCost(&I, VF);
6058 }
6059
6060 InstructionCost GatherScatterCost =
6062 ? getGatherScatterCost(&I, VF) * NumAccesses
6064
6065 InstructionCost ScalarizationCost =
6066 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6067
6068 // Choose the better solution for the current VF,
6069 // record this decision, and use it during vectorization.
6071 InstWidening Decision;
6072 if (InterleaveCost <= GatherScatterCost &&
6073 InterleaveCost < ScalarizationCost) {
6074 Decision = CM_Interleave;
6075 Cost = InterleaveCost;
6076 } else if (GatherScatterCost < ScalarizationCost) {
6077 Decision = CM_GatherScatter;
6078 Cost = GatherScatterCost;
6079 } else {
6080 Decision = CM_Scalarize;
6081 Cost = ScalarizationCost;
6082 }
6083 // If the instruction belongs to an interleave group, the whole group
6084 // receives the same decision. The whole group receives the cost, but
6085 // the cost will actually be assigned to one instruction.
6086 if (auto Group = getInterleavedAccessGroup(&I))
6087 setWideningDecision(Group, VF, Decision, Cost);
6088 else
6089 setWideningDecision(&I, VF, Decision, Cost);
6090 }
6091 }
6092
6093 // Make sure that any load of an address and any other address computation
6094 // remains scalar unless there is gather/scatter support. This avoids
6095 // inevitable extracts into address registers, and also has the benefit of
6096 // activating LSR more, since that pass can't optimize vectorized
6097 // addresses.
6099 return;
6100
6101 // Start with all scalar pointer uses.
6103 for (BasicBlock *BB : TheLoop->blocks())
6104 for (Instruction &I : *BB) {
6105 Instruction *PtrDef =
6106 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6107 if (PtrDef && TheLoop->contains(PtrDef) &&
6109 AddrDefs.insert(PtrDef);
6110 }
6111
6112 // Add all instructions used to generate the addresses.
6114 append_range(Worklist, AddrDefs);
6115 while (!Worklist.empty()) {
6116 Instruction *I = Worklist.pop_back_val();
6117 for (auto &Op : I->operands())
6118 if (auto *InstOp = dyn_cast<Instruction>(Op))
6119 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6120 AddrDefs.insert(InstOp).second)
6121 Worklist.push_back(InstOp);
6122 }
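  // For illustration: starting from each scalar address (typically a GEP),
  // this backwards walk also pulls in the in-block arithmetic feeding it
  // (index adds, shifts, loads of pointers), so the whole address computation
  // is kept scalar together.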
6123
6124 for (auto *I : AddrDefs) {
6125 if (isa<LoadInst>(I)) {
6126      // Setting the desired widening decision should ideally be handled by
6127      // cost functions, but since this involves the task of finding out
6128 // if the loaded register is involved in an address computation, it is
6129 // instead changed here when we know this is the case.
6130 InstWidening Decision = getWideningDecision(I, VF);
6131 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6132 // Scalarize a widened load of address.
6133        setWideningDecision(
6134            I, VF, CM_Scalarize,
6135 (VF.getKnownMinValue() *
6136 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6137 else if (auto Group = getInterleavedAccessGroup(I)) {
6138 // Scalarize an interleave group of address loads.
6139 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6140 if (Instruction *Member = Group->getMember(I))
6141            setWideningDecision(
6142                Member, VF, CM_Scalarize,
6143 (VF.getKnownMinValue() *
6144 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6145 }
6146 }
6147 } else
6148      // Make sure I gets scalarized and receives a cost estimate without
6149      // scalarization overhead.
6150 ForcedScalars[VF].insert(I);
6151 }
6152}
6153
6154void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6155  assert(!VF.isScalar() &&
6156 "Trying to set a vectorization decision for a scalar VF");
6157
6158 for (BasicBlock *BB : TheLoop->blocks()) {
6159 // For each instruction in the old loop.
6160 for (Instruction &I : *BB) {
6161 CallInst *CI = dyn_cast<CallInst>(&I);
6162
6163 if (!CI)
6164 continue;
6165
6166      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6167      InstructionCost ScalarCost = InstructionCost::getInvalid();
6168      InstructionCost VectorCost = InstructionCost::getInvalid();
6169      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6170
6171 Function *ScalarFunc = CI->getCalledFunction();
6172 Type *ScalarRetTy = CI->getType();
6173 SmallVector<Type *, 4> Tys, ScalarTys;
6174 bool MaskRequired = Legal->isMaskRequired(CI);
6175 for (auto &ArgOp : CI->args())
6176 ScalarTys.push_back(ArgOp->getType());
6177
6178 // Compute corresponding vector type for return value and arguments.
6179 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6180 for (Type *ScalarTy : ScalarTys)
6181 Tys.push_back(ToVectorTy(ScalarTy, VF));
6182
6183 // An in-loop reduction using an fmuladd intrinsic is a special case;
6184 // we don't want the normal cost for that intrinsic.
6185      if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6186        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6187          setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6188                                  getVectorIntrinsicIDForCall(CI, TLI),
6189                                  std::nullopt, *RedCost);
6190          continue;
6190 continue;
6191 }
6192
6193 // Estimate cost of scalarized vector call. The source operands are
6194 // assumed to be vectors, so we need to extract individual elements from
6195 // there, execute VF scalar calls, and then gather the result into the
6196 // vector return value.
6197 InstructionCost ScalarCallCost =
6198 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6199
6200 // Compute costs of unpacking argument values for the scalar calls and
6201 // packing the return values to a vector.
6202 InstructionCost ScalarizationCost =
6203 getScalarizationOverhead(CI, VF, CostKind);
6204
6205 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
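      // For illustration: with VF = 4, a call f(x) costed this way is
      // 4 * (scalar call cost) plus the overhead of extracting the four lanes
      // of x and inserting the four results back into a vector return value.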
6206
6207 // Find the cost of vectorizing the call, if we can find a suitable
6208 // vector variant of the function.
6209 bool UsesMask = false;
6210 VFInfo FuncInfo;
6211 Function *VecFunc = nullptr;
6212 // Search through any available variants for one we can use at this VF.
6213 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6214 // Must match requested VF.
6215 if (Info.Shape.VF != VF)
6216 continue;
6217
6218 // Must take a mask argument if one is required
6219 if (MaskRequired && !Info.isMasked())
6220 continue;
6221
6222 // Check that all parameter kinds are supported
6223 bool ParamsOk = true;
6224 for (VFParameter Param : Info.Shape.Parameters) {
6225 switch (Param.ParamKind) {
6226          case VFParamKind::Vector:
6227            break;
6228          case VFParamKind::OMP_Uniform: {
6229            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6230 // Make sure the scalar parameter in the loop is invariant.
6231 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6232 TheLoop))
6233 ParamsOk = false;
6234 break;
6235 }
6236          case VFParamKind::OMP_Linear: {
6237            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6238 // Find the stride for the scalar parameter in this loop and see if
6239 // it matches the stride for the variant.
6240 // TODO: do we need to figure out the cost of an extract to get the
6241 // first lane? Or do we hope that it will be folded away?
6242 ScalarEvolution *SE = PSE.getSE();
6243 const auto *SAR =
6244 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6245
6246 if (!SAR || SAR->getLoop() != TheLoop) {
6247 ParamsOk = false;
6248 break;
6249 }
6250
6251 const SCEVConstant *Step =
6252 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6253
6254 if (!Step ||
6255 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6256 ParamsOk = false;
6257
6258 break;
6259 }
6260          case VFParamKind::GlobalPredicate:
6261            UsesMask = true;
6262 break;
6263 default:
6264 ParamsOk = false;
6265 break;
6266 }
6267 }
6268
6269 if (!ParamsOk)
6270 continue;
6271
6272 // Found a suitable candidate, stop here.
6273 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6274 FuncInfo = Info;
6275 break;
6276 }
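      // For illustration: vector variants are typically advertised via a
      // "vector-function-abi-variant" attribute on the call site, e.g. a
      // mangled name such as _ZGVnN4v_foo mapping foo to a 4-lane variant;
      // the name here is only an example of such a mapping, not something
      // this file defines.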
6277
6278 // Add in the cost of synthesizing a mask if one wasn't required.
6279 InstructionCost MaskCost = 0;
6280 if (VecFunc && UsesMask && !MaskRequired)
6281 MaskCost = TTI.getShuffleCost(
6282            TargetTransformInfo::SK_Broadcast,
6283            VectorType::get(IntegerType::getInt1Ty(
6284                                VecFunc->getFunctionType()->getContext()),
6285 VF));
6286
6287 if (TLI && VecFunc && !CI->isNoBuiltin())
6288 VectorCost =
6289 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6290
6291 // Find the cost of an intrinsic; some targets may have instructions that
6292 // perform the operation without needing an actual call.
6293      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6294      if (IID != Intrinsic::not_intrinsic)
6295 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6296
6297 InstructionCost Cost = ScalarCost;
6298 InstWidening Decision = CM_Scalarize;
6299
6300 if (VectorCost <= Cost) {
6301 Cost = VectorCost;
6302 Decision = CM_VectorCall;
6303 }
6304
6305 if (IntrinsicCost <= Cost) {
6306 Cost = IntrinsicCost;
6307 Decision = CM_IntrinsicCall;
6308 }
6309
6310 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6311                              FuncInfo.getParamIndexForOptionalMask(), Cost);
6312    }
6313 }
6314}
6315
6316InstructionCost
6317LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6318                                               ElementCount VF) {
6319 // If we know that this instruction will remain uniform, check the cost of
6320 // the scalar version.
6321  if (isUniformAfterVectorization(I, VF))
6322    VF = ElementCount::getFixed(1);
6323
6324 if (VF.isVector() && isProfitableToScalarize(I, VF))
6325 return InstsToScalarize[VF][I];
6326
6327 // Forced scalars do not have any scalarization overhead.
6328 auto ForcedScalar = ForcedScalars.find(VF);
6329 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6330 auto InstSet = ForcedScalar->second;
6331 if (InstSet.count(I))
6332      return getInstructionCost(I, ElementCount::getFixed(1)) *
6333             VF.getKnownMinValue();
6334 }
6335
6336 Type *RetTy = I->getType();
6337  if (canTruncateToMinimalBitwidth(I, VF))
6338    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6339 auto SE = PSE.getSE();
6340  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6341
6342 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6343 ElementCount VF) -> bool {
6344 if (VF.isScalar())
6345 return true;
6346
6347 auto Scalarized = InstsToScalarize.find(VF);
6348 assert(Scalarized != InstsToScalarize.end() &&
6349 "VF not yet analyzed for scalarization profitability");
6350 return !Scalarized->second.count(I) &&
6351 llvm::all_of(I->users(), [&](User *U) {
6352 auto *UI = cast<Instruction>(U);
6353 return !Scalarized->second.count(UI);
6354 });
6355 };
6356 (void) hasSingleCopyAfterVectorization;
6357
6358 Type *VectorTy;
6359 if (isScalarAfterVectorization(I, VF)) {
6360 // With the exception of GEPs and PHIs, after scalarization there should
6361 // only be one copy of the instruction generated in the loop. This is
6362 // because the VF is either 1, or any instructions that need scalarizing
6363 // have already been dealt with by the time we get here. As a result,
6364 // it means we don't have to multiply the instruction cost by VF.
6365 assert(I->getOpcode() == Instruction::GetElementPtr ||
6366 I->getOpcode() == Instruction::PHI ||
6367 (I->getOpcode() == Instruction::BitCast &&
6368 I->getType()->isPointerTy()) ||
6369 hasSingleCopyAfterVectorization(I, VF));
6370 VectorTy = RetTy;
6371 } else
6372 VectorTy = ToVectorTy(RetTy, VF);
6373
6374 if (VF.isVector() && VectorTy->isVectorTy() &&
6375 !TTI.getNumberOfParts(VectorTy))
6376    return InstructionCost::getInvalid();
6377
6378 // TODO: We need to estimate the cost of intrinsic calls.
6379 switch (I->getOpcode()) {
6380 case Instruction::GetElementPtr:
6381 // We mark this instruction as zero-cost because the cost of GEPs in
6382 // vectorized code depends on whether the corresponding memory instruction
6383 // is scalarized or not. Therefore, we handle GEPs with the memory
6384 // instruction cost.
6385 return 0;
6386 case Instruction::Br: {
6387    // In cases of scalarized and predicated instructions, there will be VF
6388    // predicated blocks in the vectorized loop. Each branch around these
6389    // blocks also requires an extract of its vector compare i1 element.
6390 // Note that the conditional branch from the loop latch will be replaced by
6391 // a single branch controlling the loop, so there is no extra overhead from
6392 // scalarization.
6393 bool ScalarPredicatedBB = false;
6394 BranchInst *BI = cast<BranchInst>(I);
6395 if (VF.isVector() && BI->isConditional() &&
6396 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6397 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6398 BI->getParent() != TheLoop->getLoopLatch())
6399 ScalarPredicatedBB = true;
6400
6401 if (ScalarPredicatedBB) {
6402 // Not possible to scalarize scalable vector with predicated instructions.
6403      if (VF.isScalable())
6404        return InstructionCost::getInvalid();
6405      // Return cost for branches around scalarized and predicated blocks.
6406 auto *Vec_i1Ty =
6407 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6408 return (
6409          TTI.getScalarizationOverhead(
6410              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6411 /*Insert*/ false, /*Extract*/ true, CostKind) +
6412 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6413 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6414 // The back-edge branch will remain, as will all scalar branches.
6415 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6416 else
6417 // This branch will be eliminated by if-conversion.
6418 return 0;
6419 // Note: We currently assume zero cost for an unconditional branch inside
6420 // a predicated block since it will become a fall-through, although we
6421 // may decide in the future to call TTI for all branches.
6422 }
6423 case Instruction::PHI: {
6424 auto *Phi = cast<PHINode>(I);
6425
6426 // First-order recurrences are replaced by vector shuffles inside the loop.
6427 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6428 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6429 // penultimate value of the recurrence.
6430 // TODO: Consider vscale_range info.
6431      if (VF.isScalable() && VF.getKnownMinValue() == 1)
6432        return InstructionCost::getInvalid();
6433      SmallVector<int> Mask(VF.getKnownMinValue());
6434      std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6435      return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6436                                cast<VectorType>(VectorTy), Mask, CostKind,
6437 VF.getKnownMinValue() - 1);
6438 }
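    // For illustration: for a recurrence such as y[i] = x[i] + x[i - 1], the
    // value from the previous iteration is obtained with a splice-style
    // shuffle of the previous and current vectors, which is what is being
    // costed here.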
6439
6440 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6441 // converted into select instructions. We require N - 1 selects per phi
6442 // node, where N is the number of incoming values.
6443 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6444      return (Phi->getNumIncomingValues() - 1) *
6445             TTI.getCmpSelInstrCost(
6446                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6447                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6448                 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6449
6450 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6451 }
6452 case Instruction::UDiv:
6453 case Instruction::SDiv:
6454 case Instruction::URem:
6455 case Instruction::SRem:
6456 if (VF.isVector() && isPredicatedInst(I)) {
6457 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6458 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6459 ScalarCost : SafeDivisorCost;
6460 }
6461 // We've proven all lanes safe to speculate, fall through.
6462 [[fallthrough]];
6463 case Instruction::Add:
6464 case Instruction::FAdd:
6465 case Instruction::Sub:
6466 case Instruction::FSub:
6467 case Instruction::Mul:
6468 case Instruction::FMul:
6469 case Instruction::FDiv:
6470 case Instruction::FRem:
6471 case Instruction::Shl:
6472 case Instruction::LShr:
6473 case Instruction::AShr:
6474 case Instruction::And:
6475 case Instruction::Or:
6476 case Instruction::Xor: {
6477 // If we're speculating on the stride being 1, the multiplication may
6478 // fold away. We can generalize this for all operations using the notion
6479 // of neutral elements. (TODO)
6480 if (I->getOpcode() == Instruction::Mul &&
6481 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6482 PSE.getSCEV(I->getOperand(1))->isOne()))
6483 return 0;
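    // For illustration: when a runtime stride is speculated (and checked) to
    // be 1, an index expression like i * Stride reduces to i, so the multiply
    // is treated as free here.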
6484
6485 // Detect reduction patterns
6486 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6487 return *RedCost;
6488
6489 // Certain instructions can be cheaper to vectorize if they have a constant
6490 // second vector operand. One example of this are shifts on x86.
6491 Value *Op2 = I->getOperand(1);
6492 auto Op2Info = TTI.getOperandInfo(Op2);
6493 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6494 Legal->isInvariant(Op2))
6495      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6496
6497 SmallVector<const Value *, 4> Operands(I->operand_values());
6498    return TTI.getArithmeticInstrCost(
6499        I->getOpcode(), VectorTy, CostKind,
6500 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6501 Op2Info, Operands, I, TLI);
6502 }
6503 case Instruction::FNeg: {
6504    return TTI.getArithmeticInstrCost(
6505        I->getOpcode(), VectorTy, CostKind,
6506 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6507 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6508 I->getOperand(0), I);
6509 }
6510 case Instruction::Select: {
6511 SelectInst *SI = cast<SelectInst>(I);
6512 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6513 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6514
6515 const Value *Op0, *Op1;
6516 using namespace llvm::PatternMatch;
6517 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6518 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6519 // select x, y, false --> x & y
6520 // select x, true, y --> x | y
6521 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6522 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6523 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6524 Op1->getType()->getScalarSizeInBits() == 1);
6525
6526      SmallVector<const Value *, 2> Operands{Op0, Op1};
6527      return TTI.getArithmeticInstrCost(
6528          match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6529 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6530 }
6531
6532 Type *CondTy = SI->getCondition()->getType();
6533 if (!ScalarCond)
6534 CondTy = VectorType::get(CondTy, VF);
6535
6536    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6537    if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6538 Pred = Cmp->getPredicate();
6539 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6540 CostKind, I);
6541 }
6542 case Instruction::ICmp:
6543 case Instruction::FCmp: {
6544 Type *ValTy = I->getOperand(0)->getType();
6545 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6546 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6547 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6548 VectorTy = ToVectorTy(ValTy, VF);
6549 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6550 cast<CmpInst>(I)->getPredicate(), CostKind,
6551 I);
6552 }
6553 case Instruction::Store:
6554 case Instruction::Load: {
6555 ElementCount Width = VF;
6556 if (Width.isVector()) {
6557 InstWidening Decision = getWideningDecision(I, Width);
6558 assert(Decision != CM_Unknown &&
6559 "CM decision should be taken at this point");
6560      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6561        return InstructionCost::getInvalid();
6562      if (Decision == CM_Scalarize)
6563 Width = ElementCount::getFixed(1);
6564 }
6565 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
6566 return getMemoryInstructionCost(I, VF);
6567 }
6568 case Instruction::BitCast:
6569 if (I->getType()->isPointerTy())
6570 return 0;
6571 [[fallthrough]];
6572 case Instruction::ZExt:
6573 case Instruction::SExt:
6574 case Instruction::FPToUI:
6575 case Instruction::FPToSI:
6576 case Instruction::FPExt:
6577 case Instruction::PtrToInt:
6578 case Instruction::IntToPtr:
6579 case Instruction::SIToFP:
6580 case Instruction::UIToFP:
6581 case Instruction::Trunc:
6582 case Instruction::FPTrunc: {
6583 // Computes the CastContextHint from a Load/Store instruction.
6584 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6585 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6586 "Expected a load or a store!");
6587
6588 if (VF.isScalar() || !TheLoop->contains(I))
6589        return TTI::CastContextHint::Normal;
6590
6591      switch (getWideningDecision(I, VF)) {
6592      case CM_GatherScatter:
6593        return TTI::CastContextHint::GatherScatter;
6594      case CM_Interleave:
6595        return TTI::CastContextHint::Interleave;
6596      case CM_Scalarize:
6597      case CM_Widen:
6598        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6599                                        : TTI::CastContextHint::Normal;
6600      case CM_Widen_Reverse:
6601        return TTI::CastContextHint::Reversed;
6602      case CM_Unknown:
6603        llvm_unreachable("Instr did not go through cost modelling?");
6604      case CM_VectorCall:
6605      case CM_IntrinsicCall:
6606        llvm_unreachable_internal("Instr has invalid widening decision");
6607      }
6608
6609 llvm_unreachable("Unhandled case!");
6610 };
6611
6612 unsigned Opcode = I->getOpcode();
6613    TTI::CastContextHint CCH = TTI::CastContextHint::None;
6614    // For Trunc, the context is the only user, which must be a StoreInst.
6615 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6616 if (I->hasOneUse())
6617 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6618 CCH = ComputeCCH(Store);
6619 }
6620 // For Z/Sext, the context is the operand, which must be a LoadInst.
6621 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6622 Opcode == Instruction::FPExt) {
6623 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6624 CCH = ComputeCCH(Load);
6625 }
6626
6627 // We optimize the truncation of induction variables having constant
6628 // integer steps. The cost of these truncations is the same as the scalar
6629 // operation.
6630 if (isOptimizableIVTruncate(I, VF)) {
6631 auto *Trunc = cast<TruncInst>(I);
6632 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6633 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6634 }
6635
6636 // Detect reduction patterns
6637 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6638 return *RedCost;
6639
6640 Type *SrcScalarTy = I->getOperand(0)->getType();
6641 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6642 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6643 SrcScalarTy =
6644 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6645 Type *SrcVecTy =
6646 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6647
6648    if (canTruncateToMinimalBitwidth(I, VF)) {
6649 // If the result type is <= the source type, there will be no extend
6650 // after truncating the users to the minimal required bitwidth.
6651 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6652 (I->getOpcode() == Instruction::ZExt ||
6653 I->getOpcode() == Instruction::SExt))
6654 return 0;
6655 }
6656
6657 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6658 }
6659 case Instruction::Call:
6660 return getVectorCallCost(cast<CallInst>(I), VF);
6661 case Instruction::ExtractValue:
6662    return TTI.getInstructionCost(I, CostKind);
6663  case Instruction::Alloca:
6664 // We cannot easily widen alloca to a scalable alloca, as
6665 // the result would need to be a vector of pointers.
6666 if (VF.isScalable())
6667      return InstructionCost::getInvalid();
6668    [[fallthrough]];
6669 default:
6670 // This opcode is unknown. Assume that it is the same as 'mul'.
6671 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6672 } // end of switch.
6673}
6674
6675void LoopVectorizationCostModel::collectValuesToIgnore() {
6676  // Ignore ephemeral values.
6677  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6678
6679  SmallVector<Value *, 4> DeadInterleavePointerOps;
6680  SmallVector<Value *, 4> DeadOps;
6681 for (BasicBlock *BB : TheLoop->blocks())
6682 for (Instruction &I : *BB) {
6683      // Find all stores to invariant variables. Since they are going to sink
6684      // outside the loop, we do not need to calculate their cost.
6685 StoreInst *SI;
6686 if ((SI = dyn_cast<StoreInst>(&I)) &&
6687 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
6688 ValuesToIgnore.insert(&I);
6689
6690 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6691 continue;
6692
6693 // Add instructions that would be trivially dead and are only used by
6694 // values already ignored to DeadOps to seed worklist.
6695      if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6696          all_of(I.users(), [this](User *U) {
6697 return VecValuesToIgnore.contains(U) || ValuesToIgnore.contains(U);
6698 }))
6699 DeadOps.push_back(&I);
6700
6701 // For interleave groups, we only create a pointer for the start of the
6702 // interleave group. Queue up addresses of group members except the insert
6703 // position for further processing.
6704 if (isAccessInterleaved(&I)) {
6705 auto *Group = getInterleavedAccessGroup(&I);
6706 if (Group->getInsertPos() == &I)
6707 continue;
6708 Value *PointerOp = getLoadStorePointerOperand(&I);
6709 DeadInterleavePointerOps.push_back(PointerOp);
6710 }
6711 }
6712
6713 // Mark ops feeding interleave group members as free, if they are only used
6714 // by other dead computations.
6715 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6716 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6717 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6718 Instruction *UI = cast<Instruction>(U);
6719 return !VecValuesToIgnore.contains(U) &&
6720 (!isAccessInterleaved(UI) ||
6721 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6722 }))
6723 continue;
6724 VecValuesToIgnore.insert(Op);
6725 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6726 }
6727
6728 // Mark ops that would be trivially dead and are only used by ignored
6729 // instructions as free.
6730 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6731 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6732 // Skip any op that shouldn't be considered dead.
6733 if (!Op || !TheLoop->contains(Op) ||
6734        !wouldInstructionBeTriviallyDead(Op, TLI) ||
6735        any_of(Op->users(), [this](User *U) {
6736 return !VecValuesToIgnore.contains(U) && !ValuesToIgnore.contains(U);
6737 }))
6738 continue;
6739
6740 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6741 // which applies for both scalar and vector versions. Otherwise it is only
6742 // dead in vector versions, so only add it to VecValuesToIgnore.
6743 if (all_of(Op->users(),
6744 [this](User *U) { return ValuesToIgnore.contains(U); }))
6745 ValuesToIgnore.insert(Op);
6746
6747 VecValuesToIgnore.insert(Op);
6748 DeadOps.append(Op->op_begin(), Op->op_end());
6749 }
6750
6751 // Ignore type-promoting instructions we identified during reduction
6752 // detection.
6753 for (const auto &Reduction : Legal->getReductionVars()) {
6754 const RecurrenceDescriptor &RedDes = Reduction.second;
6755 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6756 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6757 }
6758 // Ignore type-casting instructions we identified during induction
6759 // detection.
6760 for (const auto &Induction : Legal->getInductionVars()) {
6761 const InductionDescriptor &IndDes = Induction.second;
6762 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6763 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6764 }
6765}
6766
6767void LoopVectorizationCostModel::collectInLoopReductions() {
6768  for (const auto &Reduction : Legal->getReductionVars()) {
6769 PHINode *Phi = Reduction.first;
6770 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6771
6772 // We don't collect reductions that are type promoted (yet).
6773 if (RdxDesc.getRecurrenceType() != Phi->getType())
6774 continue;
6775
6776 // If the target would prefer this reduction to happen "in-loop", then we
6777 // want to record it as such.
6778 unsigned Opcode = RdxDesc.getOpcode();
6779 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6780 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6781                                   TargetTransformInfo::ReductionFlags()))
6782      continue;
6783
6784 // Check that we can correctly put the reductions into the loop, by
6785 // finding the chain of operations that leads from the phi to the loop
6786 // exit value.
6787 SmallVector<Instruction *, 4> ReductionOperations =
6788 RdxDesc.getReductionOpChain(Phi, TheLoop);
6789 bool InLoop = !ReductionOperations.empty();
6790
6791 if (InLoop) {
6792 InLoopReductions.insert(Phi);
6793 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6794 Instruction *LastChain = Phi;
6795 for (auto *I : ReductionOperations) {
6796 InLoopReductionImmediateChains[I] = LastChain;
6797 LastChain = I;
6798 }
6799 }
6800 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6801 << " reduction for phi: " << *Phi << "\n");
6802 }
6803}
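// For illustration: an in-loop reduction keeps a scalar accumulator and
// reduces each vector of partial values inside the loop (e.g. one
// llvm.vector.reduce.add per iteration), whereas an out-of-loop reduction
// keeps a vector accumulator and performs a single reduction after the loop.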
6804
6805VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
6806                               DebugLoc DL, const Twine &Name) {
6807  assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
6808         Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
6809 return tryInsertInstruction(
6810 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
6811}
6812
6813// This function will select a scalable VF if the target supports scalable
6814// vectors and a fixed one otherwise.
6815// TODO: we could return a pair of values that specify the max VF and
6816// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6817// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6818// doesn't have a cost model that can choose which plan to execute if
6819// more than one is generated.
6820static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6821                                     LoopVectorizationCostModel &CM) {
6822  unsigned WidestType;
6823 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6824
6825  TargetTransformInfo::RegisterKind RegKind =
6826      TTI.enableScalableVectorization()
6827          ? TargetTransformInfo::RGK_ScalableVector
6828          : TargetTransformInfo::RGK_FixedWidthVector;
6829
6830  TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
6831 unsigned N = RegSize.getKnownMinValue() / WidestType;
6832 return ElementCount::get(N, RegSize.isScalable());
6833}
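// For illustration: with a 512-bit vector register and a widest loop type of
// 64 bits, the formula above yields N = 512 / 64 = 8, i.e. VF 8 (a scalable
// VF if the register size is scalable).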
6834
6835VectorizationFactor
6836LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6837  ElementCount VF = UserVF;
6838 // Outer loop handling: They may require CFG and instruction level
6839 // transformations before even evaluating whether vectorization is profitable.
6840 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6841 // the vectorization pipeline.
6842 if (!OrigLoop->isInnermost()) {
6843 // If the user doesn't provide a vectorization factor, determine a
6844 // reasonable one.
6845 if (UserVF.isZero()) {
6846 VF = determineVPlanVF(TTI, CM);
6847 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6848
6849 // Make sure we have a VF > 1 for stress testing.
6850 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6851 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6852 << "overriding computed VF.\n");
6853 VF = ElementCount::getFixed(4);
6854 }
6855    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6856               !ForceTargetSupportsScalableVectors) {
6857 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6858 << "not supported by the target.\n");
6859      reportVectorizationFailure(
6860          "Scalable vectorization requested but not supported by the target",
6861 "the scalable user-specified vectorization width for outer-loop "
6862 "vectorization cannot be used because the target does not support "
6863 "scalable vectors.",
6864 "ScalableVFUnfeasible", ORE, OrigLoop);
6865      return VectorizationFactor::Disabled();
6866    }
6867 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6868    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6869           "VF needs to be a power of two");
6870 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6871 << "VF " << VF << " to build VPlans.\n");
6872 buildVPlans(VF, VF);
6873
6874 // For VPlan build stress testing, we bail out after VPlan construction.
6875    if (VPlanBuildStressTest)
6876      return VectorizationFactor::Disabled();
6877
6878 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6879 }
6880
6881 LLVM_DEBUG(
6882 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6883 "VPlan-native path.\n");
6884  return VectorizationFactor::Disabled();
6885}
6886
6887std::optional<VectorizationFactor>
6888LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6889  assert(OrigLoop->isInnermost() && "Inner loop expected.");
6890  CM.collectValuesToIgnore();
6891  CM.collectElementTypesForWidening();
6892
6893 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6894  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
6895 return std::nullopt;
6896
6897 // Invalidate interleave groups if all blocks of loop will be predicated.
6898 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6899      !useMaskedInterleavedAccesses(TTI)) {
6900    LLVM_DEBUG(
6901 dbgs()
6902 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6903 "which requires masked-interleaved support.\n");
6904    CM.InterleaveInfo.invalidateGroups();
6905    // Invalidating interleave groups also requires invalidating all decisions
6906 // based on them, which includes widening decisions and uniform and scalar
6907 // values.
6908    CM.invalidateCostModelingDecisions();
6909  }
6910
6911 if (CM.foldTailByMasking())
6912    Legal->prepareToFoldTailByMasking();
6913
6914 ElementCount MaxUserVF =
6915 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6916 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
6917 if (!UserVF.isZero() && UserVFIsLegal) {
6918    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6919           "VF needs to be a power of two");
6920 // Collect the instructions (and their associated costs) that will be more
6921 // profitable to scalarize.
6922    CM.collectInLoopReductions();
6923    if (CM.selectUserVectorizationFactor(UserVF)) {
6924 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6925 buildVPlansWithVPRecipes(UserVF, UserVF);
6926 if (!hasPlanWithVF(UserVF)) {
6927 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
6928 << ".\n");
6929 return std::nullopt;
6930 }
6931
6933 return {{UserVF, 0, 0}};
6934 } else
6935 reportVectorizationInfo("UserVF ignored because of invalid costs.",
6936 "InvalidCost", ORE, OrigLoop);
6937 }
6938
6939 // Collect the Vectorization Factor Candidates.
6940 SmallVector<ElementCount> VFCandidates;
6941 for (auto VF = ElementCount::getFixed(1);
6942 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
6943 VFCandidates.push_back(VF);
6944 for (auto VF = ElementCount::getScalable(1);
6945 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
6946 VFCandidates.push_back(VF);
6947
6948  CM.collectInLoopReductions();
6949  for (const auto &VF : VFCandidates) {
6950 // Collect Uniform and Scalar instructions after vectorization with VF.
6951    CM.collectUniformsAndScalars(VF);
6952
6953 // Collect the instructions (and their associated costs) that will be more
6954 // profitable to scalarize.
6955 if (VF.isVector())
6956      CM.collectInstsToScalarize(VF);
6957  }
6958
6959 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6960 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6961
6963 if (VPlans.empty())
6964 return std::nullopt;
6965 if (all_of(VPlans,
6966 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
6967    return VectorizationFactor::Disabled();
6968
6969 // Select the optimal vectorization factor according to the legacy cost-model.
6970 // This is now only used to verify the decisions by the new VPlan-based
6971 // cost-model and will be retired once the VPlan-based cost-model is
6972 // stabilized.
6973 VectorizationFactor VF = selectVectorizationFactor();
6974 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
6975 if (!hasPlanWithVF(VF.Width)) {
6976 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
6977 << ".\n");
6978 return std::nullopt;
6979 }
6980 return VF;
6981}
6982
6983InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6984                                             ElementCount VF) const {
6985 return CM.getInstructionCost(UI, VF);
6986}
6987
6988bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6989 return CM.ValuesToIgnore.contains(UI) ||
6990 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6991 SkipCostComputation.contains(UI);
6992}
6993
6994InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6995 ElementCount VF) const {
6996  InstructionCost Cost = 0;
6997  LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
6998 VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
6999
7000 // Cost modeling for inductions is inaccurate in the legacy cost model
7001 // compared to the recipes that are generated. To match here initially during
7002 // VPlan cost model bring up directly use the induction costs from the legacy
7003 // cost model. Note that we do this as pre-processing; the VPlan may not have
7004 // any recipes associated with the original induction increment instruction
7005 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7006 // the cost of induction phis and increments (both that are represented by
7007 // recipes and those that are not), to avoid distinguishing between them here,
7008 // and skip all recipes that represent induction phis and increments (the
7009 // former case) later on, if they exist, to avoid counting them twice.
7010 // Similarly we pre-compute the cost of any optimized truncates.
7011 // TODO: Switch to more accurate costing based on VPlan.
7012 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7013 Instruction *IVInc = cast<Instruction>(
7014 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7015 SmallVector<Instruction *> IVInsts = {IV, IVInc};
7016 for (User *U : IV->users()) {
7017 auto *CI = cast<Instruction>(U);
7018 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7019 continue;
7020 IVInsts.push_back(CI);
7021 }
7022 for (Instruction *IVInst : IVInsts) {
7023 if (!CostCtx.SkipCostComputation.insert(IVInst).second)
7024 continue;
7025 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7026 LLVM_DEBUG({
7027 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7028 << ": induction instruction " << *IVInst << "\n";
7029 });
7030 Cost += InductionCost;
7031 }
7032 }
7033
7034 /// Compute the cost of all exiting conditions of the loop using the legacy
7035 /// cost model. This is to match the legacy behavior, which adds the cost of
7036 /// all exit conditions. Note that this over-estimates the cost, as there will
7037 /// be a single condition to control the vector loop.
7038  SmallVector<BasicBlock *> Exiting;
7039  CM.TheLoop->getExitingBlocks(Exiting);
7040 SetVector<Instruction *> ExitInstrs;
7041 // Collect all exit conditions.
7042 for (BasicBlock *EB : Exiting) {
7043 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7044 if (!Term)
7045 continue;
7046 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7047 ExitInstrs.insert(CondI);
7048 }
7049 }
7050 // Compute the cost of all instructions only feeding the exit conditions.
7051 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7052 Instruction *CondI = ExitInstrs[I];
7053 if (!OrigLoop->contains(CondI) ||
7054 !CostCtx.SkipCostComputation.insert(CondI).second)
7055 continue;
7056 Cost += CostCtx.getLegacyCost(CondI, VF);
7057 for (Value *Op : CondI->operands()) {
7058 auto *OpI = dyn_cast<Instruction>(Op);
7059 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7060 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7061 !ExitInstrs.contains(cast<Instruction>(U));
7062 }))
7063 continue;
7064 ExitInstrs.insert(OpI);
7065 }
7066 }
7067
7068 // The legacy cost model has special logic to compute the cost of in-loop
7069 // reductions, which may be smaller than the sum of all instructions involved
7070 // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7071 // which the legacy cost model uses to assign cost. Pre-compute their costs
7072 // for now.
7073 // TODO: Switch to costing based on VPlan once the logic has been ported.
7074 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7075 if (!CM.isInLoopReduction(RedPhi) &&
7076        !RecurrenceDescriptor::isAnyOfRecurrenceKind(
7077            RdxDesc.getRecurrenceKind()))
7078 continue;
7079
7080 // AnyOf reduction codegen may remove the select. To match the legacy cost
7081 // model, pre-compute the cost for AnyOf reductions here.
7082    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7083            RdxDesc.getRecurrenceKind())) {
7084 auto *Select = cast<SelectInst>(*find_if(
7085 RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7086 assert(!CostCtx.SkipCostComputation.contains(Select) &&
7087 "reduction op visited multiple times");
7088 CostCtx.SkipCostComputation.insert(Select);
7089 auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7090 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7091 << ":\n any-of reduction " << *Select << "\n");
7092 Cost += ReductionCost;
7093 continue;
7094 }
7095
7096 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7097 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7098 ChainOps.end());
7099 // Also include the operands of instructions in the chain, as the cost-model
7100 // may mark extends as free.
7101 for (auto *ChainOp : ChainOps) {
7102 for (Value *Op : ChainOp->operands()) {
7103 if (auto *I = dyn_cast<Instruction>(Op))
7104 ChainOpsAndOperands.insert(I);
7105 }
7106 }
7107
7108 // Pre-compute the cost for I, if it has a reduction pattern cost.
7109 for (Instruction *I : ChainOpsAndOperands) {
7110 auto ReductionCost = CM.getReductionPatternCost(
7111 I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7112 if (!ReductionCost)
7113 continue;
7114
7115 assert(!CostCtx.SkipCostComputation.contains(I) &&
7116 "reduction op visited multiple times");
7117 CostCtx.SkipCostComputation.insert(I);
7118 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7119 << ":\n in-loop reduction " << *I << "\n");
7120 Cost += *ReductionCost;
7121 }
7122 }
7123
7124 // Pre-compute the costs for branches except for the backedge, as the number
7125 // of replicate regions in a VPlan may not directly match the number of
7126 // branches, which would lead to different decisions.
7127 // TODO: Compute cost of branches for each replicate region in the VPlan,
7128 // which is more accurate than the legacy cost model.
7129 for (BasicBlock *BB : OrigLoop->blocks()) {
7130 if (BB == OrigLoop->getLoopLatch())
7131 continue;
7132 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7133 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7134 Cost += BranchCost;
7135 }
7136 // Now compute and add the VPlan-based cost.
7137 Cost += Plan.cost(VF, CostCtx);
7138 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7139 return Cost;
7140}
7141
7142VPlan &LoopVectorizationPlanner::getBestPlan() const {
7143  // If there is a single VPlan with a single VF, return it directly.
7144 VPlan &FirstPlan = *VPlans[0];
7145 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7146 return FirstPlan;
7147
7148 VPlan *BestPlan = &FirstPlan;
7149  ElementCount ScalarVF = ElementCount::getFixed(1);
7150  assert(hasPlanWithVF(ScalarVF) &&
7151 "More than a single plan/VF w/o any plan having scalar VF");
7152
7153 // TODO: Compute scalar cost using VPlan-based cost model.
7154 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7155 VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
7156
7157 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7158 if (ForceVectorization) {
7159 // Ignore scalar width, because the user explicitly wants vectorization.
7160 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7161 // evaluation.
7162 BestFactor.Cost = InstructionCost::getMax();
7163 }
7164
7165 for (auto &P : VPlans) {
7166 for (ElementCount VF : P->vectorFactors()) {
7167 if (VF.isScalar())
7168 continue;
7169 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7170 LLVM_DEBUG(
7171 dbgs()
7172 << "LV: Not considering vector loop of width " << VF
7173 << " because it will not generate any vector instructions.\n");
7174 continue;
7175 }
7176
7177 InstructionCost Cost = cost(*P, VF);
7178 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7179 if (isMoreProfitable(CurrentFactor, BestFactor)) {
7180 BestFactor = CurrentFactor;
7181 BestPlan = &*P;
7182 }
7183 }
7184 }
7185 BestPlan->setVF(BestFactor.Width);
7186 return *BestPlan;
7187}
7188
7189VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7190  assert(count_if(VPlans,
7191 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7192 1 &&
7193 "Best VF has not a single VPlan.");
7194
7195 for (const VPlanPtr &Plan : VPlans) {
7196 if (Plan->hasVF(VF))
7197 return *Plan.get();
7198 }
7199 llvm_unreachable("No plan found!");
7200}
7201
7202static void addRuntimeUnrollDisableMetaData(Loop *L) {
7203  SmallVector<Metadata *, 4> MDs;
7204  // Reserve first location for self reference to the LoopID metadata node.
7205 MDs.push_back(nullptr);
7206 bool IsUnrollMetadata = false;
7207 MDNode *LoopID = L->getLoopID();
7208 if (LoopID) {
7209 // First find existing loop unrolling disable metadata.
7210 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7211 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7212 if (MD) {
7213 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7214 IsUnrollMetadata =
7215 S && S->getString().starts_with("llvm.loop.unroll.disable");
7216 }
7217 MDs.push_back(LoopID->getOperand(i));
7218 }
7219 }
7220
7221 if (!IsUnrollMetadata) {
7222 // Add runtime unroll disable metadata.
7223 LLVMContext &Context = L->getHeader()->getContext();
7224 SmallVector<Metadata *, 1> DisableOperands;
7225 DisableOperands.push_back(
7226 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7227 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7228 MDs.push_back(DisableNode);
7229 MDNode *NewLoopID = MDNode::get(Context, MDs);
7230 // Set operand 0 to refer to the loop id itself.
7231 NewLoopID->replaceOperandWith(0, NewLoopID);
7232 L->setLoopID(NewLoopID);
7233 }
7234}
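// For illustration: the effect is loop metadata along the lines of
//   !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// attached to the loop latch, telling later passes not to runtime-unroll the
// already-vectorized loop.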
7235
7236// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7237// create a merge phi node for it and add it to \p ReductionResumeValues.
7238static void createAndCollectMergePhiForReduction(
7239    VPInstruction *RedResult,
7240    DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7241    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7242    bool VectorizingEpilogue) {
7243  if (!RedResult ||
7244      RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7245    return;
7246
7247 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7248 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7249
7250 Value *FinalValue =
7251 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7252 auto *ResumePhi =
7253 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7254 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7255 RdxDesc.getRecurrenceKind())) {
7256 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7257 assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7258 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7259 ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7260 }
7261 assert((!VectorizingEpilogue || ResumePhi) &&
7262 "when vectorizing the epilogue loop, we need a resume phi from main "
7263 "vector loop");
7264
7265 // TODO: bc.merge.rdx should not be created here, instead it should be
7266 // modeled in VPlan.
7267 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7268 // Create a phi node that merges control-flow from the backedge-taken check
7269 // block and the middle block.
7270 auto *BCBlockPhi =
7271 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7272 LoopScalarPreHeader->getTerminator()->getIterator());
7273
7274 // If we are fixing reductions in the epilogue loop then we should already
7275 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7276 // we carry over the incoming values correctly.
7277 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7278 if (Incoming == LoopMiddleBlock)
7279 BCBlockPhi->addIncoming(FinalValue, Incoming);
7280 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7281 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7282 Incoming);
7283 else
7284 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7285 }
7286
7287 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7288 // TODO: This fixup should instead be modeled in VPlan.
7289 // Fix the scalar loop reduction variable with the incoming reduction sum
7290 // from the vector body and from the backedge value.
7291 int IncomingEdgeBlockIdx =
7292 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7293 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7294 // Pick the other block.
7295 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7296 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7297 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7298 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7299
7300 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7301}
7302
7303std::pair<DenseMap<const SCEV *, Value *>,
7304          DenseMap<const RecurrenceDescriptor *, Value *>>
7305LoopVectorizationPlanner::executePlan(
7306    ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7307 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7308 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7309 assert(BestVPlan.hasVF(BestVF) &&
7310 "Trying to execute plan with unsupported VF");
7311 assert(BestVPlan.hasUF(BestUF) &&
7312 "Trying to execute plan with unsupported UF");
7313 assert(
7314 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7315 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7316 (void)IsEpilogueVectorization;
7317
7318 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7319
7320 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7321 << ", UF=" << BestUF << '\n');
7322 BestVPlan.setName("Final VPlan");
7323 LLVM_DEBUG(BestVPlan.dump());
7324
7325 // Perform the actual loop transformation.
7326 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7327 OrigLoop->getHeader()->getContext());
7328
7329 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7330 // before making any changes to the CFG.
7331 if (!BestVPlan.getPreheader()->empty()) {
7332 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7334 BestVPlan.getPreheader()->execute(&State);
7335 }
7336 if (!ILV.getTripCount())
7337 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7338 else
7339 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7340 "count during epilogue vectorization");
7341
7342 // 1. Set up the skeleton for vectorization, including vector pre-header and
7343 // middle block. The vector loop is created during VPlan execution.
7344 Value *CanonicalIVStartValue;
7345 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7346 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7347 : State.ExpandedSCEVs);
7348#ifdef EXPENSIVE_CHECKS
7349 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7350#endif
7351
7352 // Only use noalias metadata when using memory checks guaranteeing no overlap
7353 // across all iterations.
7354 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7355 std::unique_ptr<LoopVersioning> LVer = nullptr;
7356 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7358
7359 // We currently don't use LoopVersioning for the actual loop cloning but we
7360 // still use it to add the noalias metadata.
7361 // TODO: Find a better way to re-use LoopVersioning functionality to add
7362 // metadata.
7363 LVer = std::make_unique<LoopVersioning>(
7364 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7365 PSE.getSE());
7366 State.LVer = &*LVer;
7367    State.LVer->prepareNoAliasMetadata();
7368  }
7369
7371
7372 //===------------------------------------------------===//
7373 //
7374 // Notice: any optimization or new instruction that go
7375 // into the code below should also be implemented in
7376 // the cost-model.
7377 //
7378 //===------------------------------------------------===//
7379
7380 // 2. Copy and widen instructions from the old loop into the new loop.
7381 BestVPlan.prepareToExecute(ILV.getTripCount(),
7382 ILV.getOrCreateVectorTripCount(nullptr),
7383 CanonicalIVStartValue, State);
7384
7385 BestVPlan.execute(&State);
7386
7387 // 2.5 Collect reduction resume values.
7388  DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7389  auto *ExitVPBB =
7390 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7391 for (VPRecipeBase &R : *ExitVPBB) {
7392    createAndCollectMergePhiForReduction(
7393        dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7394 State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7395 }
7396
7397 // 2.6. Maintain Loop Hints
7398 // Keep all loop hints from the original loop on the vector loop (we'll
7399 // replace the vectorizer-specific hints below).
7400 MDNode *OrigLoopID = OrigLoop->getLoopID();
7401
7402 std::optional<MDNode *> VectorizedLoopID =
7403      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7404                                      LLVMLoopVectorizeFollowupVectorized});
7405
7406 VPBasicBlock *HeaderVPBB =
7407      BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7408  Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7409 if (VectorizedLoopID)
7410 L->setLoopID(*VectorizedLoopID);
7411 else {
7412 // Keep all loop hints from the original loop on the vector loop (we'll
7413 // replace the vectorizer-specific hints below).
7414 if (MDNode *LID = OrigLoop->getLoopID())
7415 L->setLoopID(LID);
7416
7417 LoopVectorizeHints Hints(L, true, *ORE);
7418 Hints.setAlreadyVectorized();
7419 }
7420  TargetTransformInfo::UnrollingPreferences UP;
7421  TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7422 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7423    addRuntimeUnrollDisableMetaData(L);
7424
7425 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7426 // predication, updating analyses.
7427 ILV.fixVectorizedLoop(State, BestVPlan);
7428
7430
7431 // 4. Adjust branch weight of the branch in the middle block.
7432 auto *MiddleTerm =
7433 cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7434 if (MiddleTerm->isConditional() &&
7435 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7436 // Assume that `Count % VectorTripCount` is equally distributed.
7437 unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7438 assert(TripCount > 0 && "trip count should not be zero");
7439 const uint32_t Weights[] = {1, TripCount - 1};
7440 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7441 }
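  // For illustration: with VF = 4 and UF = 2 the vector step is 8, so the
  // middle-block branch gets weights {1, 7}, i.e. the scalar remainder loop
  // is expected to be entered for roughly 1 in 8 of the possible remainder
  // values.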
7442
7443 return {State.ExpandedSCEVs, ReductionResumeValues};
7444}
7445
7446#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7447void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7448  for (const auto &Plan : VPlans)
7449    if (PrintVPlansInDotFormat)
7450      Plan->printDOT(O);
7451    else
7452 Plan->print(O);
7453}
7454#endif
7455
7456//===--------------------------------------------------------------------===//
7457// EpilogueVectorizerMainLoop
7458//===--------------------------------------------------------------------===//
7459
7460/// This function is partially responsible for generating the control flow
7461/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7462std::pair<BasicBlock *, Value *>
7463EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7464    const SCEV2ValueTy &ExpandedSCEVs) {
7465  createVectorLoopSkeleton("");
7466
7467 // Generate the code to check the minimum iteration count of the vector
7468 // epilogue (see below).
7469  EPI.EpilogueIterationCountCheck =
7470      emitIterationCountCheck(LoopScalarPreHeader, true);
7471  EPI.EpilogueIterationCountCheck->setName("iter.check");
7472
7473 // Generate the code to check any assumptions that we've made for SCEV
7474 // expressions.
7475  EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7476
7477 // Generate the code that checks at runtime if arrays overlap. We put the
7478 // checks into a separate block to make the more common case of few elements
7479 // faster.
7480  EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7481
7482 // Generate the iteration count check for the main loop, *after* the check
7483 // for the epilogue loop, so that the path-length is shorter for the case
7484 // that goes directly through the vector epilogue. The longer-path length for
7485 // the main loop is compensated for, by the gain from vectorizing the larger
7486 // trip count. Note: the branch will get updated later on when we vectorize
7487 // the epilogue.
7488  EPI.MainLoopIterationCountCheck =
7489      emitIterationCountCheck(LoopScalarPreHeader, false);
7490
7491 // Generate the induction variable.
7492  EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7493
7494 // Skip induction resume value creation here because they will be created in
7495 // the second pass for the scalar loop. The induction resume values for the
7496 // inductions in the epilogue loop are created before executing the plan for
7497 // the epilogue loop.
7498
7499 return {LoopVectorPreHeader, nullptr};
7500}
7501
7502void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7503  LLVM_DEBUG({
7504 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7505 << "Main Loop VF:" << EPI.MainLoopVF
7506 << ", Main Loop UF:" << EPI.MainLoopUF
7507 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7508 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7509 });
7510}
7511
7514 dbgs() << "intermediate fn:\n"
7515 << *OrigLoop->getHeader()->getParent() << "\n";
7516 });
7517}
7518
7519BasicBlock *
7520EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7521                                                    bool ForEpilogue) {
7522 assert(Bypass && "Expected valid bypass basic block.");
7523 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7524 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7525 Value *Count = getTripCount();
7526 // Reuse existing vector loop preheader for TC checks.
7527 // Note that new preheader block is generated for vector loop.
7528 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7529 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7530
7531 // Generate code to check if the loop's trip count is less than VF * UF of the
7532 // main vector loop.
7533 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7534 : VF.isVector())
7535               ? ICmpInst::ICMP_ULE
7536               : ICmpInst::ICMP_ULT;
7537
7538 Value *CheckMinIters = Builder.CreateICmp(
7539 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7540 "min.iters.check");
7541
7542 if (!ForEpilogue)
7543 TCCheckBlock->setName("vector.main.loop.iter.check");
7544
7545 // Create new preheader for vector loop.
7546 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7547 DT, LI, nullptr, "vector.ph");
7548
7549 if (ForEpilogue) {
7550 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7551 DT->getNode(Bypass)->getIDom()) &&
7552 "TC check is expected to dominate Bypass");
7553
7554 // Update dominator for Bypass.
7555 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7556 LoopBypassBlocks.push_back(TCCheckBlock);
7557
7558 // Save the trip count so we don't have to regenerate it in the
7559 // vec.epilog.iter.check. This is safe to do because the trip count
7560 // generated here dominates the vector epilog iter check.
7561 EPI.TripCount = Count;
7562 }
7563
7564 BranchInst &BI =
7565 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7567 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7568 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7569
7570 return TCCheckBlock;
7571}
7572
7573//===--------------------------------------------------------------------===//
7574// EpilogueVectorizerEpilogueLoop
7575//===--------------------------------------------------------------------===//
7576
7577/// This function is partially responsible for generating the control flow
7578/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7579std::pair<BasicBlock *, Value *>
7580EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7581    const SCEV2ValueTy &ExpandedSCEVs) {
7582 createVectorLoopSkeleton("vec.epilog.");
7583
7584  // Now, compare the remaining count; if there aren't enough iterations to
7585  // execute the vectorized epilogue, skip to the scalar part.
7586 LoopVectorPreHeader->setName("vec.epilog.ph");
7587 BasicBlock *VecEpilogueIterationCountCheck =
7588      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
7589                 nullptr, "vec.epilog.iter.check", true);
7590  emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7591                                          VecEpilogueIterationCountCheck);
7592
7593 // Adjust the control flow taking the state info from the main loop
7594 // vectorization into account.
7596 "expected this to be saved from the previous pass.");
7598 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7599
7602
7604 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7605
7606 if (EPI.SCEVSafetyCheck)
7608 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7609 if (EPI.MemSafetyCheck)
7611 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7612
7614 VecEpilogueIterationCountCheck,
7615 VecEpilogueIterationCountCheck->getSinglePredecessor());
7616
7619 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7620 // If there is an epilogue which must run, there's no edge from the
7621 // middle block to exit blocks and thus no need to update the immediate
7622 // dominator of the exit blocks.
7625
7626 // Keep track of bypass blocks, as they feed start values to the induction and
7627 // reduction phis in the scalar loop preheader.
7628 if (EPI.SCEVSafetyCheck)
7629    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7630  if (EPI.MemSafetyCheck)
7631    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7632  LoopBypassBlocks.push_back(VecEpilogueIterationCountCheck);
7633
7634 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7635 // reductions which merge control-flow from the latch block and the middle
7636 // block. Update the incoming values here and move the Phi into the preheader.
7637 SmallVector<PHINode *, 4> PhisInBlock;
7638 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7639 PhisInBlock.push_back(&Phi);
7640
7641 for (PHINode *Phi : PhisInBlock) {
7642 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7643 Phi->replaceIncomingBlockWith(
7644 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7645 VecEpilogueIterationCountCheck);
7646
7647 // If the phi doesn't have an incoming value from the
7648 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7649 // value and also those from other check blocks. This is needed for
7650 // reduction phis only.
7651 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7652 return EPI.EpilogueIterationCountCheck == IncB;
7653 }))
7654 continue;
7655 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7656 if (EPI.SCEVSafetyCheck)
7657 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7658 if (EPI.MemSafetyCheck)
7659 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7660 }
7661
7662 // Generate a resume induction for the vector epilogue and put it in the
7663 // vector epilogue preheader
7664 Type *IdxTy = Legal->getWidestInductionType();
7665 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7667 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7668 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7670
7671 // Generate induction resume values. These variables save the new starting
7672 // indexes for the scalar loop. They are used to test if there are any tail
7673 // iterations left once the vector loop has completed.
7674 // Note that when the vectorized epilogue is skipped due to iteration count
7675 // check, then the resume value for the induction variable comes from
7676 // the trip count of the main vector loop, hence passing the AdditionalBypass
7677 // argument.
7678 createInductionResumeValues(ExpandedSCEVs,
7679 {VecEpilogueIterationCountCheck,
7680 EPI.VectorTripCount} /* AdditionalBypass */);
7681
7682 return {LoopVectorPreHeader, EPResumeVal};
7683}
7684
7685BasicBlock *
7686EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7687    BasicBlock *Bypass, BasicBlock *Insert) {
7688
7690 "Expected trip count to have been saved in the first pass.");
7691 assert(
7692 (!isa<Instruction>(EPI.TripCount) ||
7693 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7694 "saved trip count does not dominate insertion point.");
7695 Value *TC = EPI.TripCount;
7696 IRBuilder<> Builder(Insert->getTerminator());
7697 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7698
7699 // Generate code to check if the loop's trip count is less than VF * UF of the
7700 // vector epilogue loop.
7701 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7704
7705 Value *CheckMinIters =
7706 Builder.CreateICmp(P, Count,
7709 "min.epilog.iters.check");
7710
7711 BranchInst &BI =
7712 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7714 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7715 unsigned EpilogueLoopStep = EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7717 // We assume the remaining `Count` is equally distributed in
7718 // [0, MainLoopStep)
7719 // So the probability for `Count < EpilogueLoopStep` should be
7720 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
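 // Illustrative example (values chosen for exposition, not from the source):
 // a main loop with VF=8 and UF=2 has MainLoopStep=16; with an epilogue of
 // VF=4 and UF=1, EpilogueLoopStep=4, so the estimated skip probability is
 // 4/16 and the branch weights below become {4, 12}.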
7721 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7722 const uint32_t Weights[] = {EstimatedSkipCount,
7723 MainLoopStep - EstimatedSkipCount};
7724 setBranchWeights(BI, Weights, /*IsExpected=*/false);
7725 }
7726 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7727 LoopBypassBlocks.push_back(Insert);
7728 return Insert;
7729}
7730
7732 LLVM_DEBUG({
7733 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7734 << "Epilogue Loop VF:" << EPI.EpilogueVF
7735 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7736 });
7737}
7738
7741 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7742 });
7743}
7744
7746 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7747 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7748 bool PredicateAtRangeStart = Predicate(Range.Start);
7749
7750 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7751 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7752 Range.End = TmpVF;
7753 break;
7754 }
7755
7756 return PredicateAtRangeStart;
7757}
7758
7759/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7760/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7761/// of VF's starting at a given VF and extending it as much as possible. Each
7762/// vectorization decision can potentially shorten this sub-range during
7763/// buildVPlan().
7765 ElementCount MaxVF) {
7766 auto MaxVFTimes2 = MaxVF * 2;
7767 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7768 VFRange SubRange = {VF, MaxVFTimes2};
7769 VPlans.push_back(buildVPlan(SubRange));
7770 VF = SubRange.End;
7771 }
7772}
7773
7774iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7776 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7777 if (auto *I = dyn_cast<Instruction>(Op)) {
7778 if (auto *R = Ingredient2Recipe.lookup(I))
7779 return R->getVPSingleValue();
7780 }
7781 return Plan.getOrAddLiveIn(Op);
7782 };
7783 return map_range(Operands, Fn);
7784}
7785
7787 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7788
7789 // Look for cached value.
7790 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7791 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7792 if (ECEntryIt != EdgeMaskCache.end())
7793 return ECEntryIt->second;
7794
7795 VPValue *SrcMask = getBlockInMask(Src);
7796
7797 // The terminator has to be a branch inst!
7798 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7799 assert(BI && "Unexpected terminator found");
7800
7801 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7802 return EdgeMaskCache[Edge] = SrcMask;
7803
7804 // If source is an exiting block, we know the exit edge is dynamically dead
7805 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7806 // adding uses of an otherwise potentially dead instruction.
7807 if (OrigLoop->isLoopExiting(Src))
7808 return EdgeMaskCache[Edge] = SrcMask;
7809
7810 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7811 assert(EdgeMask && "No Edge Mask found for condition");
7812
7813 if (BI->getSuccessor(0) != Dst)
7814 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7815
7816 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7817 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
7818 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
7819 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7820 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
7821 }
7822
7823 return EdgeMaskCache[Edge] = EdgeMask;
7824}
7825
7827 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7828
7829 // Look for cached value.
7830 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7831 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
7832 assert(ECEntryIt != EdgeMaskCache.end() &&
7833 "looking up mask for edge which has not been created");
7834 return ECEntryIt->second;
7835}
7836
7838 BasicBlock *Header = OrigLoop->getHeader();
7839
7840 // When not folding the tail, use nullptr to model all-true mask.
7841 if (!CM.foldTailByMasking()) {
7842 BlockMaskCache[Header] = nullptr;
7843 return;
7844 }
7845
7846 // Introduce the early-exit compare IV <= BTC to form header block mask.
7847 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7848 // constructing the desired canonical IV in the header block as its first
7849 // non-phi instructions.
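 // Illustrative example of why BTC is used: if the trip count equals 2^N for
 // an iN-typed IV, TC wraps to 0 while BTC = 2^N - 1 is still representable,
 // so the mask compare IV <= BTC remains correct.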
7850
7851 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7852 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
7853 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
7854 HeaderVPBB->insert(IV, NewInsertionPoint);
7855
7856 VPBuilder::InsertPointGuard Guard(Builder);
7857 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
7858 VPValue *BlockMask = nullptr;
7860 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
7861 BlockMaskCache[Header] = BlockMask;
7862}
7863
7865 // Return the cached value.
7866 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
7867 assert(BCEntryIt != BlockMaskCache.end() &&
7868 "Trying to access mask for block without one.");
7869 return BCEntryIt->second;
7870}
7871
7873 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7874 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
7875 assert(OrigLoop->getHeader() != BB &&
7876 "Loop header must have cached block mask");
7877
7878 // All-one mask is modelled as no-mask following the convention for masked
7879 // load/store/gather/scatter. Initialize BlockMask to no-mask.
7880 VPValue *BlockMask = nullptr;
7881 // This is the block mask. We OR all incoming edges.
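 // Illustrative example: a block BB with predecessors P1 and P2 gets
 // or(edge-mask(P1, BB), edge-mask(P2, BB)); if any edge mask is nullptr
 // (all-one), the loop below records nullptr (all-one) for BB as well.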
7882 for (auto *Predecessor : predecessors(BB)) {
7883 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
7884 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
7885 BlockMaskCache[BB] = EdgeMask;
7886 return;
7887 }
7888
7889 if (!BlockMask) { // BlockMask has its initialized nullptr value.
7890 BlockMask = EdgeMask;
7891 continue;
7892 }
7893
7894 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
7895 }
7896
7897 BlockMaskCache[BB] = BlockMask;
7898}
7899
7901VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
7902 VFRange &Range) {
7903 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7904 "Must be called with either a load or store");
7905
7906 auto willWiden = [&](ElementCount VF) -> bool {
7908 CM.getWideningDecision(I, VF);
7910 "CM decision should be taken at this point.");
7912 return true;
7913 if (CM.isScalarAfterVectorization(I, VF) ||
7914 CM.isProfitableToScalarize(I, VF))
7915 return false;
7917 };
7918
7920 return nullptr;
7921
7922 VPValue *Mask = nullptr;
7923 if (Legal->isMaskRequired(I))
7924 Mask = getBlockInMask(I->getParent());
7925
7926 // Determine if the pointer operand of the access is either consecutive or
7927 // reverse consecutive.
7929 CM.getWideningDecision(I, Range.Start);
7931 bool Consecutive =
7933
7934 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
7935 if (Consecutive) {
7936 auto *GEP = dyn_cast<GetElementPtrInst>(
7937 Ptr->getUnderlyingValue()->stripPointerCasts());
7938 auto *VectorPtr = new VPVectorPointerRecipe(
7939 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
7940 I->getDebugLoc());
7941 Builder.getInsertBlock()->appendRecipe(VectorPtr);
7942 Ptr = VectorPtr;
7943 }
7944 if (LoadInst *Load = dyn_cast<LoadInst>(I))
7945 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7946 I->getDebugLoc());
7947
7948 StoreInst *Store = cast<StoreInst>(I);
7949 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
7950 Reverse, I->getDebugLoc());
7951}
7952
7953/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7954/// insert a recipe to expand the step for the induction recipe.
7957 VPValue *Start, const InductionDescriptor &IndDesc,
7958 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
7959 assert(IndDesc.getStartValue() ==
7960 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
7961 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7962 "step must be loop invariant");
7963
7964 VPValue *Step =
7966 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
7967 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
7968 }
7969 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
7970 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
7971}
7972
7973VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
7975
7976 // Check if this is an integer or fp induction. If so, build the recipe that
7977 // produces its scalar and vector values.
7978 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7979 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
7980 *PSE.getSE(), *OrigLoop);
7981
7982 // Check if this is pointer induction. If so, build the recipe for it.
7983 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7984 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
7985 *PSE.getSE());
7987 Phi, Operands[0], Step, *II,
7989 [&](ElementCount VF) {
7990 return CM.isScalarAfterVectorization(Phi, VF);
7991 },
7992 Range));
7993 }
7994 return nullptr;
7995}
7996
7997VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
7999 // Optimize the special case where the source is a constant integer
8000 // induction variable. Notice that we can only optimize the 'trunc' case
8001 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8002 // (c) other casts depend on pointer size.
8003
8004 // Determine whether \p K is a truncation based on an induction variable that
8005 // can be optimized.
8006 auto isOptimizableIVTruncate =
8007 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8008 return [=](ElementCount VF) -> bool {
8009 return CM.isOptimizableIVTruncate(K, VF);
8010 };
8011 };
8012
8014 isOptimizableIVTruncate(I), Range)) {
8015
8016 auto *Phi = cast<PHINode>(I->getOperand(0));
8018 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8019 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8020 *OrigLoop);
8021 }
8022 return nullptr;
8023}
8024
8025VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8027 unsigned NumIncoming = Phi->getNumIncomingValues();
8028
8029 // We know that all PHIs in non-header blocks are converted into selects, so
8030 // we don't have to worry about the insertion order and we can just use the
8031 // builder. At this point we generate the predication tree. There may be
8032 // duplications since this is a simple recursive scan, but future
8033 // optimizations will clean it up.
8034 // TODO: At the moment the first mask is always skipped, but it would be
8035 // better to skip the most expensive mask.
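 // Illustrative operand layout (hypothetical incoming values v0 and v1 with
 // edge masks m0 and m1): the loop below produces {v0, v1, m1}; m0 is the
 // mask that is currently skipped.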
8036 SmallVector<VPValue *, 2> OperandsWithMask;
8037
8038 for (unsigned In = 0; In < NumIncoming; In++) {
8039 OperandsWithMask.push_back(Operands[In]);
8040 VPValue *EdgeMask =
8041 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8042 if (!EdgeMask) {
8043 assert(In == 0 && "Both null and non-null edge masks found");
8045 "Distinct incoming values with one having a full mask");
8046 break;
8047 }
8048 if (In == 0)
8049 continue;
8050 OperandsWithMask.push_back(EdgeMask);
8051 }
8052 return new VPBlendRecipe(Phi, OperandsWithMask);
8053}
8054
8055VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8057 VFRange &Range) {
8059 [this, CI](ElementCount VF) {
8060 return CM.isScalarWithPredication(CI, VF);
8061 },
8062 Range);
8063
8064 if (IsPredicated)
8065 return nullptr;
8066
8068 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8069 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8070 ID == Intrinsic::pseudoprobe ||
8071 ID == Intrinsic::experimental_noalias_scope_decl))
8072 return nullptr;
8073
8074 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8075 Ops.push_back(Operands.back());
8076
8077 // Is it beneficial to perform an intrinsic call compared to a lib call?
8078 bool ShouldUseVectorIntrinsic =
8080 [&](ElementCount VF) -> bool {
8081 return CM.getCallWideningDecision(CI, VF).Kind ==
8083 },
8084 Range);
8085 if (ShouldUseVectorIntrinsic)
8086 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
8087 CI->getDebugLoc());
8088
8089 Function *Variant = nullptr;
8090 std::optional<unsigned> MaskPos;
8091 // Is it better to call a vectorized version of the function than to scalarize
8092 // the call?
8093 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8094 [&](ElementCount VF) -> bool {
8095 // The following case may be scalarized depending on the VF.
8096 // The flag shows whether we can use a usual Call for the vectorized
8097 // version of the instruction.
8098
8099 // If we've found a variant at a previous VF, then stop looking. A
8100 // vectorized variant of a function expects input in a certain shape
8101 // -- basically the number of input registers, the number of lanes
8102 // per register, and whether there's a mask required.
8103 // We store a pointer to the variant in the VPWidenCallRecipe, so
8104 // once we have an appropriate variant it's only valid for that VF.
8105 // This will force a different vplan to be generated for each VF that
8106 // finds a valid variant.
8107 if (Variant)
8108 return false;
8110 CM.getCallWideningDecision(CI, VF);
8112 Variant = Decision.Variant;
8113 MaskPos = Decision.MaskPos;
8114 return true;
8115 }
8116
8117 return false;
8118 },
8119 Range);
8120 if (ShouldUseVectorCall) {
8121 if (MaskPos.has_value()) {
8122 // We have 2 cases that would require a mask:
8123 // 1) The block needs to be predicated, either due to a conditional
8124 // in the scalar loop or use of an active lane mask with
8125 // tail-folding, and we use the appropriate mask for the block.
8126 // 2) No mask is required for the block, but the only available
8127 // vector variant at this VF requires a mask, so we synthesize an
8128 // all-true mask.
8129 VPValue *Mask = nullptr;
8130 if (Legal->isMaskRequired(CI))
8131 Mask = getBlockInMask(CI->getParent());
8132 else
8134 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8135
8136 Ops.insert(Ops.begin() + *MaskPos, Mask);
8137 }
8138
8139 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
8141 Variant);
8142 }
8143
8144 return nullptr;
8145}
8146
8147bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8148 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8149 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8150 // Instruction should be widened, unless it is scalar after vectorization,
8151 // scalarization is profitable, or it is predicated.
8152 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8153 return CM.isScalarAfterVectorization(I, VF) ||
8154 CM.isProfitableToScalarize(I, VF) ||
8155 CM.isScalarWithPredication(I, VF);
8156 };
8158 Range);
8159}
8160
8161VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8163 VPBasicBlock *VPBB) {
8164 switch (I->getOpcode()) {
8165 default:
8166 return nullptr;
8167 case Instruction::SDiv:
8168 case Instruction::UDiv:
8169 case Instruction::SRem:
8170 case Instruction::URem: {
8171 // If not provably safe, use a select to form a safe divisor before widening the
8172 // div/rem operation itself. Otherwise fall through to general handling below.
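 // Illustrative example: for a predicated 'udiv %x, %y' the divisor becomes
 // 'select(block-mask, %y, 1)', so masked-off lanes divide by 1 instead of a
 // potentially trapping value.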
8173 if (CM.isPredicatedInst(I)) {
8174 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8175 VPValue *Mask = getBlockInMask(I->getParent());
8176 VPValue *One =
8177 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8178 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8179 Ops[1] = SafeRHS;
8180 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8181 }
8182 [[fallthrough]];
8183 }
8184 case Instruction::Add:
8185 case Instruction::And:
8186 case Instruction::AShr:
8187 case Instruction::FAdd:
8188 case Instruction::FCmp:
8189 case Instruction::FDiv:
8190 case Instruction::FMul:
8191 case Instruction::FNeg:
8192 case Instruction::FRem:
8193 case Instruction::FSub:
8194 case Instruction::ICmp:
8195 case Instruction::LShr:
8196 case Instruction::Mul:
8197 case Instruction::Or:
8198 case Instruction::Select:
8199 case Instruction::Shl:
8200 case Instruction::Sub:
8201 case Instruction::Xor:
8202 case Instruction::Freeze:
8203 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8204 };
8205}
8206
8208 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8209 for (VPHeaderPHIRecipe *R : PhisToFix) {
8210 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8211 VPRecipeBase *IncR =
8212 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8213 R->addOperand(IncR->getVPSingleValue());
8214 }
8215}
8216
8218 VFRange &Range) {
8220 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8221 Range);
8222
8223 bool IsPredicated = CM.isPredicatedInst(I);
8224
8225 // Even if the instruction is not marked as uniform, there are certain
8226 // intrinsic calls that can be effectively treated as such, so we check for
8227 // them here. Conservatively, we only do this for scalable vectors, since
8228 // for fixed-width VFs we can always fall back on full scalarization.
8229 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8230 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8231 case Intrinsic::assume:
8232 case Intrinsic::lifetime_start:
8233 case Intrinsic::lifetime_end:
8234 // For scalable vectors, if one of the operands is variant then we still
8235 // want to mark it as uniform, which will generate one instruction for just
8236 // the first lane of the vector. We can't scalarize the call in the same
8237 // way as for fixed-width vectors because we don't know how many lanes
8238 // there are.
8239 //
8240 // The reasons for doing it this way for scalable vectors are:
8241 // 1. For the assume intrinsic, generating the instruction for the first
8242 // lane is still better than not generating any at all. For
8243 // example, the input may be a splat across all lanes.
8244 // 2. For the lifetime start/end intrinsics the pointer operand only
8245 // does anything useful when the input comes from a stack object,
8246 // which suggests it should always be uniform. For non-stack objects
8247 // the effect is to poison the object, which still allows us to
8248 // remove the call.
8249 IsUniform = true;
8250 break;
8251 default:
8252 break;
8253 }
8254 }
8255 VPValue *BlockInMask = nullptr;
8256 if (!IsPredicated) {
8257 // Finalize the recipe for Instr, first if it is not predicated.
8258 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8259 } else {
8260 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8261 // Instructions marked for predication are replicated and a mask operand is
8262 // added initially. Masked replicate recipes will later be placed under an
8263 // if-then construct to prevent side-effects. Generate recipes to compute
8264 // the block mask for this region.
8265 BlockInMask = getBlockInMask(I->getParent());
8266 }
8267
8268 // Note that there is some custom logic to mark some intrinsics as uniform
8269 // manually above for scalable vectors, which this assert needs to account for
8270 // as well.
8271 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8272 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8273 "Should not predicate a uniform recipe");
8274 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8275 IsUniform, BlockInMask);
8276 return Recipe;
8277}
8278
8282 VFRange &Range, VPBasicBlock *VPBB) {
8283 // First, check for specific widening recipes that deal with inductions, Phi
8284 // nodes, calls and memory operations.
8285 VPRecipeBase *Recipe;
8286 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8287 if (Phi->getParent() != OrigLoop->getHeader())
8288 return tryToBlend(Phi, Operands);
8289
8290 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8291 return Recipe;
8292
8293 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8294 assert((Legal->isReductionVariable(Phi) ||
8295 Legal->isFixedOrderRecurrence(Phi)) &&
8296 "can only widen reductions and fixed-order recurrences here");
8297 VPValue *StartV = Operands[0];
8298 if (Legal->isReductionVariable(Phi)) {
8299 const RecurrenceDescriptor &RdxDesc =
8300 Legal->getReductionVars().find(Phi)->second;
8301 assert(RdxDesc.getRecurrenceStartValue() ==
8302 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8303 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8304 CM.isInLoopReduction(Phi),
8305 CM.useOrderedReductions(RdxDesc));
8306 } else {
8307 // TODO: Currently fixed-order recurrences are modeled as chains of
8308 // first-order recurrences. If there are no users of the intermediate
8309 // recurrences in the chain, the fixed order recurrence should be modeled
8310 // directly, enabling more efficient codegen.
8311 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8312 }
8313
8314 PhisToFix.push_back(PhiRecipe);
8315 return PhiRecipe;
8316 }
8317
8318 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8319 cast<TruncInst>(Instr), Operands, Range)))
8320 return Recipe;
8321
8322 // All widen recipes below deal only with VF > 1.
8324 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8325 return nullptr;
8326
8327 if (auto *CI = dyn_cast<CallInst>(Instr))
8328 return tryToWidenCall(CI, Operands, Range);
8329
8330 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8331 return tryToWidenMemory(Instr, Operands, Range);
8332
8333 if (!shouldWiden(Instr, Range))
8334 return nullptr;
8335
8336 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8337 return new VPWidenGEPRecipe(GEP,
8338 make_range(Operands.begin(), Operands.end()));
8339
8340 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8341 return new VPWidenSelectRecipe(
8342 *SI, make_range(Operands.begin(), Operands.end()));
8343 }
8344
8345 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8346 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8347 *CI);
8348 }
8349
8350 return tryToWiden(Instr, Operands, VPBB);
8351}
8352
8353void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8354 ElementCount MaxVF) {
8355 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8356
8357 auto MaxVFTimes2 = MaxVF * 2;
8358 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8359 VFRange SubRange = {VF, MaxVFTimes2};
8360 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8361 // Now optimize the initial VPlan.
8362 if (!Plan->hasVF(ElementCount::getFixed(1)))
8364 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8365 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8366 // TODO: try to put it close to addActiveLaneMask().
8367 // Discard the plan if it is not EVL-compatible
8368 if (CM.foldTailWithEVL() &&
8370 break;
8371 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8372 VPlans.push_back(std::move(Plan));
8373 }
8374 VF = SubRange.End;
8375 }
8376}
8377
8378// Add the necessary canonical IV and branch recipes required to control the
8379// loop.
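// Illustrative shape of the recipes added below: a canonical IV phi starting
// at 0 in the header, an 'index.next = canonical-iv + VFxUF' increment in the
// exiting block, and a BranchOnCount(index.next, vector-trip-count) that
// terminates the latch.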
8380static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8381 DebugLoc DL) {
8382 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8383 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8384
8385 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8386 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8387 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8388 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8389 Header->insert(CanonicalIVPHI, Header->begin());
8390
8391 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8392 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8393 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8394 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8395 "index.next");
8396 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8397
8398 // Add the BranchOnCount VPInstruction to the latch.
8400 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8401}
8402
8403// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8404// original exit block.
8405static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder,
8406 VPlan &Plan) {
8407 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8408 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8409 // Only handle single-exit loops with unique exit blocks for now.
8410 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8411 return;
8412
8413 // Introduce VPUsers modeling the exit values.
8414 for (PHINode &ExitPhi : ExitBB->phis()) {
8415 Value *IncomingValue =
8416 ExitPhi.getIncomingValueForBlock(ExitingBB);
8417 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8418 // Exit values for inductions are computed and updated outside of VPlan and
8419 // independent of induction recipes.
8420 // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8421 // live-outs.
8422 if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8423 !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8424 isa<VPWidenPointerInductionRecipe>(V))
8425 continue;
8426 Plan.addLiveOut(&ExitPhi, V);
8427 }
8428}
8429
8430/// Feed a resume value for every FOR from the vector loop to the scalar loop,
8431/// if the middle block branches to the scalar preheader, by introducing ExtractFromEnd
8432/// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
8433/// latter and corresponds to the scalar header.
8435 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8436
8437 // Start by finding out if the middle block branches to the scalar preheader,
8438 // which is not a VPIRBasicBlock, unlike the exit block - the other possible
8439 // successor of the middle block.
8440 // TODO: Should be replaced by
8441 // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8442 // scalar region is modeled as well.
8443 VPBasicBlock *ScalarPHVPBB = nullptr;
8444 auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
8445 for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
8446 if (isa<VPIRBasicBlock>(Succ))
8447 continue;
8448 assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
8449 ScalarPHVPBB = cast<VPBasicBlock>(Succ);
8450 }
8451 if (!ScalarPHVPBB)
8452 return;
8453
8454 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8455 VPBuilder MiddleBuilder(MiddleVPBB);
8456 // Reset insert point so new recipes are inserted before terminator and
8457 // condition, if there is either the former or both.
8458 if (auto *Terminator = MiddleVPBB->getTerminator()) {
8459 auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
8460 assert((!Condition || Condition->getParent() == MiddleVPBB) &&
8461 "Condition expected in MiddleVPBB");
8462 MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
8463 }
8464 VPValue *OneVPV = Plan.getOrAddLiveIn(
8465 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8466
8467 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8468 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8469 if (!FOR)
8470 continue;
8471
8472 // Extract the resume value and create a new VPLiveOut for it.
8473 auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
8474 {FOR->getBackedgeValue(), OneVPV},
8475 {}, "vector.recur.extract");
8476 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8477 VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
8478 "scalar.recur.init");
8479 Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
8480 }
8481}
8482
8484LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8485
8487
8488 // ---------------------------------------------------------------------------
8489 // Build initial VPlan: Scan the body of the loop in a topological order to
8490 // visit each basic block after having visited its predecessor basic blocks.
8491 // ---------------------------------------------------------------------------
8492
8493 // Create initial VPlan skeleton, having a basic block for the pre-header
8494 // which contains SCEV expansions that need to happen before the CFG is
8495 // modified; a basic block for the vector pre-header, followed by a region for
8496 // the vector loop, followed by the middle basic block. The skeleton vector
8497 // loop region contains a header and latch basic blocks.
8498
8499 bool RequiresScalarEpilogueCheck =
8501 [this](ElementCount VF) {
8502 return !CM.requiresScalarEpilogue(VF.isVector());
8503 },
8504 Range);
8506 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8507 *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8508 OrigLoop);
8509
8510 // Don't use getDecisionAndClampRange here, because we don't know the UF
8511 // so it is better for this function to be conservative, rather than to split
8512 // it up into different VPlans.
8513 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8514 bool IVUpdateMayOverflow = false;
8515 for (ElementCount VF : Range)
8516 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8517
8519 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8520 // When not folding the tail, we know that the induction increment will not
8521 // overflow.
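 // (When the tail is not folded, the vector trip count is a multiple of
 // VF * UF that does not exceed the original trip count, so incrementing the
 // canonical IV by VF * UF up to it cannot wrap.)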
8522 bool HasNUW = Style == TailFoldingStyle::None;
8523 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8524
8525 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8526
8527 // ---------------------------------------------------------------------------
8528 // Pre-construction: record ingredients whose recipes we'll need to further
8529 // process after constructing the initial VPlan.
8530 // ---------------------------------------------------------------------------
8531
8532 // For each interleave group which is relevant for this (possibly trimmed)
8533 // Range, add it to the set of groups to be later applied to the VPlan and add
8534 // placeholders for its members' Recipes which we'll be replacing with a
8535 // single VPInterleaveRecipe.
8537 auto applyIG = [IG, this](ElementCount VF) -> bool {
8538 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8539 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8541 // For scalable vectors, the only interleave factor currently supported
8542 // is 2 since we require the (de)interleave2 intrinsics instead of
8543 // shufflevectors.
8544 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8545 "Unsupported interleave factor for scalable vectors");
8546 return Result;
8547 };
8548 if (!getDecisionAndClampRange(applyIG, Range))
8549 continue;
8550 InterleaveGroups.insert(IG);
8551 };
8552
8553 // ---------------------------------------------------------------------------
8554 // Construct recipes for the instructions in the loop
8555 // ---------------------------------------------------------------------------
8556
8557 // Scan the body of the loop in a topological order to visit each basic block
8558 // after having visited its predecessor basic blocks.
8559 LoopBlocksDFS DFS(OrigLoop);
8560 DFS.perform(LI);
8561
8562 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8563 VPBasicBlock *VPBB = HeaderVPBB;
8564 BasicBlock *HeaderBB = OrigLoop->getHeader();
8565 bool NeedsMasks =
8566 CM.foldTailByMasking() ||
8567 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8568 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8569 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8570 });
8571 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8572 // Relevant instructions from basic block BB will be grouped into VPRecipe
8573 // ingredients and fill a new VPBasicBlock.
8574 if (VPBB != HeaderVPBB)
8575 VPBB->setName(BB->getName());
8576 Builder.setInsertPoint(VPBB);
8577
8578 if (VPBB == HeaderVPBB)
8579 RecipeBuilder.createHeaderMask();
8580 else if (NeedsMasks)
8581 RecipeBuilder.createBlockInMask(BB);
8582
8583 // Introduce each ingredient into VPlan.
8584 // TODO: Model and preserve debug intrinsics in VPlan.
8585 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8586 Instruction *Instr = &I;
8588 auto *Phi = dyn_cast<PHINode>(Instr);
8589 if (Phi && Phi->getParent() == HeaderBB) {
8590 Operands.push_back(Plan->getOrAddLiveIn(
8591 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8592 } else {
8593 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8594 Operands = {OpRange.begin(), OpRange.end()};
8595 }
8596
8597 // Invariant stores inside the loop will be deleted and a single store
8598 // with the final reduction value will be added to the exit block.
8599 StoreInst *SI;
8600 if ((SI = dyn_cast<StoreInst>(&I)) &&
8601 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8602 continue;
8603
8604 VPRecipeBase *Recipe =
8605 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8606 if (!Recipe)
8607 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8608
8609 RecipeBuilder.setRecipe(Instr, Recipe);
8610 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8611 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8612 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8613 // recipes and need to be moved to the phi section of HeaderVPBB:
8614 // * tail-folding (non-phi recipes computing the header mask are
8615 // introduced earlier than regular header phi recipes, and should appear
8616 // after them)
8617 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8618
8619 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8620 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8621 "unexpected recipe needs moving");
8622 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8623 } else
8624 VPBB->appendRecipe(Recipe);
8625 }
8626
8628 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8629 }
8630
8631 // After here, VPBB should not be used.
8632 VPBB = nullptr;
8633
8634 if (CM.requiresScalarEpilogue(Range)) {
8635 // No edge from the middle block to the unique exit block has been inserted
8636 // and there is nothing to fix from the vector loop; phis should have incoming
8637 // values from the scalar loop only.
8638 } else
8639 addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan);
8640
8641 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8642 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8643 "entry block must be set to a VPRegionBlock having a non-empty entry "
8644 "VPBasicBlock");
8645 RecipeBuilder.fixHeaderPhis();
8646
8648
8649 // ---------------------------------------------------------------------------
8650 // Transform initial VPlan: Apply previously taken decisions, in order, to
8651 // bring the VPlan to its final state.
8652 // ---------------------------------------------------------------------------
8653
8654 // Adjust the recipes for any inloop reductions.
8655 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8656
8657 // Interleave memory: for each Interleave Group we marked earlier as relevant
8658 // for this VPlan, replace the Recipes widening its memory instructions with a
8659 // single VPInterleaveRecipe at its insertion point.
8660 for (const auto *IG : InterleaveGroups) {
8661 auto *Recipe =
8662 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8663 SmallVector<VPValue *, 4> StoredValues;
8664 for (unsigned i = 0; i < IG->getFactor(); ++i)
8665 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8666 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8667 StoredValues.push_back(StoreR->getStoredValue());
8668 }
8669
8670 bool NeedsMaskForGaps =
8671 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8672 assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
8673 "masked interleaved groups are not allowed.");
8674 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8675 Recipe->getMask(), NeedsMaskForGaps);
8676 VPIG->insertBefore(Recipe);
8677 unsigned J = 0;
8678 for (unsigned i = 0; i < IG->getFactor(); ++i)
8679 if (Instruction *Member = IG->getMember(i)) {
8680 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8681 if (!Member->getType()->isVoidTy()) {
8682 VPValue *OriginalV = MemberR->getVPSingleValue();
8683 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8684 J++;
8685 }
8686 MemberR->eraseFromParent();
8687 }
8688 }
8689
8690 for (ElementCount VF : Range)
8691 Plan->addVF(VF);
8692 Plan->setName("Initial VPlan");
8693
8694 // Replace VPValues for known constant strides guaranteed by predicate scalar
8695 // evolution.
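 // Illustrative example: if runtime checks versioned the loop on a symbolic
 // stride being 1, the live-in for that stride value (and, below, live-ins
 // for its sext/zext users) is replaced by the constant 1.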
8696 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8697 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8698 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8699 // Only handle constant strides for now.
8700 if (!ScevStride)
8701 continue;
8702
8703 auto *CI = Plan->getOrAddLiveIn(
8704 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
8705 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
8706 StrideVPV->replaceAllUsesWith(CI);
8707
8708 // The versioned value may not be used in the loop directly but through a
8709 // sext/zext. Add new live-ins in those cases.
8710 for (Value *U : StrideV->users()) {
8711 if (!isa<SExtInst, ZExtInst>(U))
8712 continue;
8713 VPValue *StrideVPV = Plan->getLiveIn(U);
8714 if (!StrideVPV)
8715 continue;
8716 unsigned BW = U->getType()->getScalarSizeInBits();
8717 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
8718 : ScevStride->getAPInt().zext(BW);
8719 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
8720 StrideVPV->replaceAllUsesWith(CI);
8721 }
8722 }
8723
8725 return Legal->blockNeedsPredication(BB);
8726 });
8727
8728 // Sink users of fixed-order recurrence past the recipe defining the previous
8729 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8731 return nullptr;
8732
8733 if (useActiveLaneMask(Style)) {
8734 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8735 // TailFoldingStyle is visible there.
8736 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8737 bool WithoutRuntimeCheck =
8739 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8740 WithoutRuntimeCheck);
8741 }
8742 return Plan;
8743}
8744
8745VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8746 // Outer loop handling: They may require CFG and instruction level
8747 // transformations before even evaluating whether vectorization is profitable.
8748 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8749 // the vectorization pipeline.
8750 assert(!OrigLoop->isInnermost());
8751 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8752
8753 // Create new empty VPlan
8754 auto Plan = VPlan::createInitialVPlan(
8755 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8756 *PSE.getSE(), true, false, OrigLoop);
8757
8758 // Build hierarchical CFG
8759 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8760 HCFGBuilder.buildHierarchicalCFG();
8761
8762 for (ElementCount VF : Range)
8763 Plan->addVF(VF);
8764
8766 Plan,
8767 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8768 *PSE.getSE(), *TLI);
8769
8770 // Remove the existing terminator of the exiting block of the top-most region.
8771 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8772 auto *Term =
8773 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8774 Term->eraseFromParent();
8775
8776 // Tail folding is not supported for outer loops, so the induction increment
8777 // is guaranteed to not wrap.
8778 bool HasNUW = true;
8779 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8780 DebugLoc());
8781 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8782 return Plan;
8783}
8784
8785// Adjust the recipes for reductions. For in-loop reductions the chain of
8786// instructions leading from the loop exit instr to the phi needs to be converted
8787// to reductions, with one operand being vector and the other being the scalar
8788// reduction chain. For other reductions, a select is introduced between the phi
8789// and live-out recipes when folding the tail.
8790//
8791// A ComputeReductionResult recipe is added to the middle block, also for
8792// in-loop reductions which compute their result in-loop, because generating
8793// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8794//
8795// Adjust AnyOf reductions; replace the reduction phi for the selected value
8796// with a boolean reduction phi node to check if the condition is true in any
8797// iteration. The final value is selected by the final ComputeReductionResult.
8798void LoopVectorizationPlanner::adjustRecipesForReductions(
8799 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8800 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8801 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8802 // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8803 // sunk outside of the loop keep the same order as they had in the
8804 // original loop.
8805 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8806 for (VPRecipeBase &R : Header->phis()) {
8807 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8808 ReductionPHIList.emplace_back(ReductionPhi);
8809 }
8810 bool HasIntermediateStore = false;
8811 stable_sort(ReductionPHIList,
8812 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8813 const VPReductionPHIRecipe *R2) {
8814 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8815 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8816 HasIntermediateStore |= IS1 || IS2;
8817
8818 // If neither of the recipes has an intermediate store, keep the
8819 // order the same.
8820 if (!IS1 && !IS2)
8821 return false;
8822
8823 // If only one of the recipes has an intermediate store, then
8824 // move it towards the beginning of the list.
8825 if (IS1 && !IS2)
8826 return true;
8827
8828 if (!IS1 && IS2)
8829 return false;
8830
8831 // If both recipes have an intermediate store, then the recipe
8832 // with the later store should be processed earlier. So it
8833 // should go to the beginning of the list.
8834 return DT->dominates(IS2, IS1);
8835 });
8836
8837 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8838 for (VPRecipeBase *R : ReductionPHIList)
8839 R->moveBefore(*Header, Header->getFirstNonPhi());
8840
8841 for (VPRecipeBase &R : Header->phis()) {
8842 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8843 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8844 continue;
8845
8846 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8847 RecurKind Kind = RdxDesc.getRecurrenceKind();
8849 "AnyOf reductions are not allowed for in-loop reductions");
8850
8851 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8853 Worklist.insert(PhiR);
8854 for (unsigned I = 0; I != Worklist.size(); ++I) {
8855 VPSingleDefRecipe *Cur = Worklist[I];
8856 for (VPUser *U : Cur->users()) {
8857 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8858 if (!UserRecipe) {
8859 assert(isa<VPLiveOut>(U) &&
8860 "U must either be a VPSingleDef or VPLiveOut");
8861 continue;
8862 }
8863 Worklist.insert(UserRecipe);
8864 }
8865 }
8866
8867 // Visit operation "Links" along the reduction chain top-down starting from
8868 // the phi until LoopExitValue. We keep track of the previous item
8869 // (PreviousLink) to tell which of the two operands of a Link will remain
8870 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8871 // the select instructions. Blend recipes of in-loop reduction phis will
8872 // get folded to their non-phi operand, as the reduction recipe handles the
8873 // condition directly.
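 // Illustrative chain for an in-loop integer add reduction 'red = red + a[i]':
 // Worklist = {phi, add}; for the add, the operand equal to PreviousLink (the
 // phi) stays on the scalar chain and the other operand becomes VecOp.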
8874 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8875 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8876 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8877
8878 // Index of the first operand which holds a non-mask vector operand.
8879 unsigned IndexOfFirstOperand;
8880 // Recognize a call to the llvm.fmuladd intrinsic.
8881 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8882 VPValue *VecOp;
8883 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8884 if (IsFMulAdd) {
8885 assert(
8887 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8888 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8889 isa<VPWidenCallRecipe>(CurrentLink)) &&
8890 CurrentLink->getOperand(2) == PreviousLink &&
8891 "expected a call where the previous link is the added operand");
8892
8893 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8894 // need to create an fmul recipe (multiplying the first two operands of
8895 // the fmuladd together) to use as the vector operand for the fadd
8896 // reduction.
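 // Illustrative example: 'call @llvm.fmuladd(a, b, red)' becomes an FMul
 // recipe computing 'a * b', which is then used as VecOp of the fadd
 // reduction, with the previous link 'red' as the scalar chain operand.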
8897 VPInstruction *FMulRecipe = new VPInstruction(
8898 Instruction::FMul,
8899 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8900 CurrentLinkI->getFastMathFlags());
8901 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8902 VecOp = FMulRecipe;
8903 } else {
8904 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
8905 if (PhiR->isInLoop() && Blend) {
8906 assert(Blend->getNumIncomingValues() == 2 &&
8907 "Blend must have 2 incoming values");
8908 if (Blend->getIncomingValue(0) == PhiR)
8909 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8910 else {
8911 assert(Blend->getIncomingValue(1) == PhiR &&
8912 "PhiR must be an operand of the blend");
8913 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8914 }
8915 continue;
8916 }
8917
8919 if (isa<VPWidenRecipe>(CurrentLink)) {
8920 assert(isa<CmpInst>(CurrentLinkI) &&
8921 "need to have the compare of the select");
8922 continue;
8923 }
8924 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8925 "must be a select recipe");
8926 IndexOfFirstOperand = 1;
8927 } else {
8928 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8929 "Expected to replace a VPWidenSC");
8930 IndexOfFirstOperand = 0;
8931 }
8932 // Note that for non-commutable operands (cmp-selects), the semantics of
8933 // the cmp-select are captured in the recurrence kind.
8934 unsigned VecOpId =
8935 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8936 ? IndexOfFirstOperand + 1
8937 : IndexOfFirstOperand;
8938 VecOp = CurrentLink->getOperand(VecOpId);
8939 assert(VecOp != PreviousLink &&
8940 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8941 (VecOpId - IndexOfFirstOperand)) ==
8942 PreviousLink &&
8943 "PreviousLink must be the operand other than VecOp");
8944 }
8945
8946 BasicBlock *BB = CurrentLinkI->getParent();
8947 VPValue *CondOp = nullptr;
8949 CondOp = RecipeBuilder.getBlockInMask(BB);
8950
8951 VPReductionRecipe *RedRecipe =
8952 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
8953 CondOp, CM.useOrderedReductions(RdxDesc));
8954 // Append the recipe to the end of the VPBasicBlock because we need to
8955 // ensure that it comes after all of its inputs, including CondOp.
8956 // Note that this transformation may leave over dead recipes (including
8957 // CurrentLink), which will be cleaned by a later VPlan transform.
8958 LinkVPBB->appendRecipe(RedRecipe);
8959 CurrentLink->replaceAllUsesWith(RedRecipe);
8960 PreviousLink = RedRecipe;
8961 }
8962 }
8963 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8964 Builder.setInsertPoint(&*LatchVPBB->begin());
8965 VPBasicBlock *MiddleVPBB =
8966 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
8967 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8968 for (VPRecipeBase &R :
8969 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8970 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8971 if (!PhiR)
8972 continue;
8973
8974 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8975 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8976 // with a boolean reduction phi node to check if the condition is true in
8977 // any iteration. The final value is selected by the final
8978 // ComputeReductionResult.
8980 RdxDesc.getRecurrenceKind())) {
8981 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
8982 return isa<VPWidenSelectRecipe>(U) ||
8983 (isa<VPReplicateRecipe>(U) &&
8984 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
8985 Instruction::Select);
8986 }));
8987 VPValue *Cmp = Select->getOperand(0);
8988 // If the compare is checking the reduction PHI node, adjust it to check
8989 // the start value.
8990 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
8991 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
8992 if (CmpR->getOperand(I) == PhiR)
8993 CmpR->setOperand(I, PhiR->getStartValue());
8994 }
8995 VPBuilder::InsertPointGuard Guard(Builder);
8996 Builder.setInsertPoint(Select);
8997
8998 // If the true value of the select is the reduction phi, the new value is
8999 // selected if the negated condition is true in any iteration.
9000 if (Select->getOperand(1) == PhiR)
9001 Cmp = Builder.createNot(Cmp);
9002 VPValue *Or = Builder.createOr(PhiR, Cmp);
9003 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9004
9005 // Convert the reduction phi to operate on bools.
9006 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9007 OrigLoop->getHeader()->getContext())));
9008 }
9009
9010 // If tail is folded by masking, introduce selects between the phi
9011 // and the live-out instruction of each reduction, at the beginning of the
9012 // dedicated latch block.
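 // Illustrative example: the exiting value becomes
 // 'select(header-mask, updated-reduction-value, phi)', so lanes disabled by
 // tail folding keep the value carried over from the previous iteration.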
9013 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9014 auto *NewExitingVPV = PhiR->getBackedgeValue();
9015 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9016 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9017 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9018 "reduction recipe must be defined before latch");
9019 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9020 std::optional<FastMathFlags> FMFs =
9021 PhiTy->isFloatingPointTy()
9022 ? std::make_optional(RdxDesc.getFastMathFlags())
9023 : std::nullopt;
9024 NewExitingVPV =
9025 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9026 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9027 return isa<VPInstruction>(&U) &&
9028 cast<VPInstruction>(&U)->getOpcode() ==
9030 });
9033 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9035 PhiR->setOperand(1, NewExitingVPV);
9036 }
9037
9038 // If the vector reduction can be performed in a smaller type, we truncate
9039 // then extend the loop exit value to enable InstCombine to evaluate the
9040 // entire expression in the smaller type.
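 // Illustrative example: an i32 reduction whose values are known to fit in
 // i8 gets a 'trunc ... to i8' of the exiting value followed by a sext/zext
 // back to i32, so InstCombine can narrow the whole chain.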
9041 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9042 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9044 RdxDesc.getRecurrenceKind())) {
9045 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9046 Type *RdxTy = RdxDesc.getRecurrenceType();
9047 auto *Trunc =
9048 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9049 auto *Extnd =
9050 RdxDesc.isSigned()
9051 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9052 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9053
9054 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9055 Extnd->insertAfter(Trunc);
9056 if (PhiR->getOperand(1) == NewExitingVPV)
9057 PhiR->setOperand(1, Extnd->getVPSingleValue());
9058 NewExitingVPV = Extnd;
9059 }
9060
9061 // We want code in the middle block to appear to execute on the location of
9062 // the scalar loop's latch terminator because: (a) it is all compiler
9063 // generated, (b) these instructions are always executed after evaluating
9064 // the latch conditional branch, and (c) other passes may add new
9065 // predecessors which terminate on this line. This is the easiest way to
9066 // ensure we don't accidentally cause an extra step back into the loop while
9067 // debugging.
9068 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9069
9070 // TODO: At the moment ComputeReductionResult also drives creation of the
9071 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9072 // even for in-loop reductions, until the reduction resume value handling is
9073 // also modeled in VPlan.
9074 auto *FinalReductionResult = new VPInstruction(
9075 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9076 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9077 OrigExitingVPV->replaceUsesWithIf(
9078 FinalReductionResult,
9079 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9080 }
9081
9083}
9084
9087 "Not a pointer induction according to InductionDescriptor!");
9088 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9089 "Unexpected type.");
9091 "Recipe should have been replaced");
9092
9093 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9094 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9095 Type *PhiType = IndDesc.getStep()->getType();
9096
9097 // Build a pointer phi
9098 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9099 Type *ScStValueType = ScalarStartValue->getType();
9100 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9101 CanonicalIV->getIterator());
9102
9103 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9104 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9105
9106 // A pointer induction, performed by using a gep
9107 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9108
9109 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9110 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9111 Value *NumUnrolledElems =
9112 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9113 Value *InductionGEP = GetElementPtrInst::Create(
9114 State.Builder.getInt8Ty(), NewPointerPhi,
9115 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9116 InductionLoc);
9117 // Add induction update using an incorrect block temporarily. The phi node
9118 // will be fixed after VPlan execution. Note that at this point the latch
9119 // block cannot be used, as it does not exist yet.
9120 // TODO: Model increment value in VPlan, by turning the recipe into a
9121 // multi-def and a subclass of VPHeaderPHIRecipe.
9122 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9123
9124 // Create UF many actual address geps that use the pointer
9125 // phi as base and a vectorized version of the step value
9126 // (<step*0, ..., step*N>) as offset.
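// Illustrative sketch (assumed values, not from the original source): for a
// fixed VF of 4, UF of 2 and scalar step S, the loop below produces roughly
//   Part 0: %vector.gep  = getelementptr i8, ptr %pointer.phi,
//                          <4 x i64> (<i64 0, 1, 2, 3> * splat(S))
//   Part 1: %vector.gep1 = getelementptr i8, ptr %pointer.phi,
//                          <4 x i64> ((<i64 0, 1, 2, 3> + splat(4)) * splat(S))
// so lane L of part P addresses %pointer.phi + (P * VF + L) * S bytes.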
9127 for (unsigned Part = 0; Part < State.UF; ++Part) {
9128 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9129 Value *StartOffsetScalar =
9130 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9131 Value *StartOffset =
9132 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9133 // Create a vector of consecutive numbers from zero to VF.
9134 StartOffset = State.Builder.CreateAdd(
9135 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9136
9137 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9138 "scalar step must be the same across all parts");
9139 Value *GEP = State.Builder.CreateGEP(
9140 State.Builder.getInt8Ty(), NewPointerPhi,
9141 State.Builder.CreateMul(
9142 StartOffset,
9143 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9144 "vector.gep"));
9145 State.set(this, GEP, Part);
9146 }
9147}
9148
9149 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9150   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9151
9152 // Fast-math-flags propagate from the original induction instruction.
9153   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9154   if (FPBinOp)
9155 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9156
9157 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9158 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9159 Value *DerivedIV = emitTransformedIndex(
9160 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9161 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9162 DerivedIV->setName("offset.idx");
9163 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9164
9165 State.set(this, DerivedIV, VPIteration(0, 0));
9166}
9167
9168 void VPReplicateRecipe::execute(VPTransformState &State) {
9169   Instruction *UI = getUnderlyingInstr();
9170   if (State.Instance) { // Generate a single instance.
9171 assert((State.VF.isScalar() || !isUniform()) &&
9172 "uniform recipe shouldn't be predicated");
9173 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9174 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9175 // Insert scalar instance packing it into a vector.
9176 if (State.VF.isVector() && shouldPack()) {
9177 // If we're constructing lane 0, initialize to start from poison.
9178 if (State.Instance->Lane.isFirstLane()) {
9179         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9180         Value *Poison = PoisonValue::get(
9181             VectorType::get(UI->getType(), State.VF));
9182 State.set(this, Poison, State.Instance->Part);
9183 }
9184 State.packScalarIntoVectorValue(this, *State.Instance);
9185 }
9186 return;
9187 }
9188
9189 if (IsUniform) {
9190 // If the recipe is uniform across all parts (instead of just per VF), only
9191 // generate a single instance.
9192 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9193 all_of(operands(), [](VPValue *Op) {
9194 return Op->isDefinedOutsideVectorRegions();
9195 })) {
9196 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9197 if (user_begin() != user_end()) {
9198 for (unsigned Part = 1; Part < State.UF; ++Part)
9199 State.set(this, State.get(this, VPIteration(0, 0)),
9200 VPIteration(Part, 0));
9201 }
9202 return;
9203 }
9204
9205 // Uniform within VL means we need to generate lane 0 only for each
9206 // unrolled copy.
9207 for (unsigned Part = 0; Part < State.UF; ++Part)
9208 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9209 return;
9210 }
9211
9212 // A store of a loop varying value to a uniform address only needs the last
9213 // copy of the store.
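// Illustrative example (assumption, not taken from the original source):
//   for (int i = 0; i < n; ++i)
//     *q = a[i];          // q is loop-invariant, a[i] is loop-varying
// Only the value stored on the final iteration is observable after the loop,
// so it is enough to emit the store for the last lane of the last part.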
9214   if (isa<StoreInst>(UI) &&
9215       vputils::isUniformAfterVectorization(getOperand(1))) {
9216     auto Lane = VPLane::getLastLaneForVF(State.VF);
9217 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9218 State);
9219 return;
9220 }
9221
9222 // Generate scalar instances for all VF lanes of all UF parts.
9223 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9224 const unsigned EndLane = State.VF.getKnownMinValue();
9225 for (unsigned Part = 0; Part < State.UF; ++Part)
9226 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9227 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9228}
9229
9230 void VPWidenLoadRecipe::execute(VPTransformState &State) {
9231   auto *LI = cast<LoadInst>(&Ingredient);
9232
9233 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9234 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9235 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9236 bool CreateGather = !isConsecutive();
9237
9238 auto &Builder = State.Builder;
9240 for (unsigned Part = 0; Part < State.UF; ++Part) {
9241 Value *NewLI;
9242 Value *Mask = nullptr;
9243 if (auto *VPMask = getMask()) {
9244 // Mask reversal is only needed for non-all-one (null) masks, as reverse
9245 // of a null all-one mask is a null mask.
9246 Mask = State.get(VPMask, Part);
9247 if (isReverse())
9248 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9249 }
9250
9251 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
9252 if (CreateGather) {
9253 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
9254 "wide.masked.gather");
9255 } else if (Mask) {
9256 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
9257 PoisonValue::get(DataTy),
9258 "wide.masked.load");
9259 } else {
9260 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
9261 }
9262 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9263 State.addMetadata(NewLI, LI);
9264 if (Reverse)
9265 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9266 State.set(this, NewLI, Part);
9267 }
9268}
9269
9270/// Use all-true mask for reverse rather than actual mask, as it avoids a
9271/// dependence w/o affecting the result.
9272 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
9273                                      Value *EVL, const Twine &Name) {
9274 VectorType *ValTy = cast<VectorType>(Operand->getType());
9275 Value *AllTrueMask =
9276 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
9277 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9278 {Operand, AllTrueMask, EVL}, nullptr, Name);
9279}
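// Illustrative IR sketch (assumed operand types, not from the original source):
// for a <vscale x 4 x i32> %v and an explicit vector length %evl, the helper
// above emits roughly
//   %rev = call <vscale x 4 x i32>
//       @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %v,
//           <vscale x 4 x i1> splat (i1 true), i32 %evl)
// using an all-true mask so the reverse does not depend on the load/store mask.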
9280
9281 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
9282   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9283 "explicit vector length.");
9284 auto *LI = cast<LoadInst>(&Ingredient);
9285
9286 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9287 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9288 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9289 bool CreateGather = !isConsecutive();
9290
9291 auto &Builder = State.Builder;
9293 CallInst *NewLI;
9294 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9295 Value *Addr = State.get(getAddr(), 0, !CreateGather);
9296 Value *Mask = nullptr;
9297 if (VPValue *VPMask = getMask()) {
9298 Mask = State.get(VPMask, 0);
9299 if (isReverse())
9300 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9301 } else {
9302 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9303 }
9304
9305 if (CreateGather) {
9306 NewLI =
9307 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9308 nullptr, "wide.masked.gather");
9309 } else {
9310 VectorBuilder VBuilder(Builder);
9311 VBuilder.setEVL(EVL).setMask(Mask);
9312 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9313 Instruction::Load, DataTy, Addr, "vp.op.load"));
9314 }
9315 NewLI->addParamAttr(
9316 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9317 State.addMetadata(NewLI, LI);
9318 Instruction *Res = NewLI;
9319 if (isReverse())
9320 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
9321 State.set(this, Res, 0);
9322}
9323
9324 void VPWidenStoreRecipe::execute(VPTransformState &State) {
9325   auto *SI = cast<StoreInst>(&Ingredient);
9326
9327 VPValue *StoredVPValue = getStoredValue();
9328 bool CreateScatter = !isConsecutive();
9329 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9330
9331 auto &Builder = State.Builder;
9333
9334 for (unsigned Part = 0; Part < State.UF; ++Part) {
9335 Instruction *NewSI = nullptr;
9336 Value *Mask = nullptr;
9337 if (auto *VPMask = getMask()) {
9338 // Mask reversal is only needed for non-all-one (null) masks, as reverse
9339 // of a null all-one mask is a null mask.
9340 Mask = State.get(VPMask, Part);
9341 if (isReverse())
9342 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9343 }
9344
9345 Value *StoredVal = State.get(StoredVPValue, Part);
9346 if (isReverse()) {
9347 // If we store to reverse consecutive memory locations, then we need
9348 // to reverse the order of elements in the stored value.
9349 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9350 // We don't want to update the value in the map as it might be used in
9351 // another expression. So don't call resetVectorValue(StoredVal).
9352 }
9353 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
9354 if (CreateScatter)
9355 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
9356 else if (Mask)
9357 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
9358 else
9359 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
9360 State.addMetadata(NewSI, SI);
9361 }
9362}
9363
9364 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
9365   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9366 "explicit vector length.");
9367 auto *SI = cast<StoreInst>(&Ingredient);
9368
9369 VPValue *StoredValue = getStoredValue();
9370 bool CreateScatter = !isConsecutive();
9371 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9372
9373 auto &Builder = State.Builder;
9375
9376 CallInst *NewSI = nullptr;
9377 Value *StoredVal = State.get(StoredValue, 0);
9378 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9379 if (isReverse())
9380 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
9381 Value *Mask = nullptr;
9382 if (VPValue *VPMask = getMask()) {
9383 Mask = State.get(VPMask, 0);
9384 if (isReverse())
9385 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9386 } else {
9387 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9388 }
9389 Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9390 if (CreateScatter) {
9391 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9392 Intrinsic::vp_scatter,
9393 {StoredVal, Addr, Mask, EVL});
9394 } else {
9395 VectorBuilder VBuilder(Builder);
9396 VBuilder.setEVL(EVL).setMask(Mask);
9397 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9398 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9399 {StoredVal, Addr}));
9400 }
9401 NewSI->addParamAttr(
9402 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9403 State.addMetadata(NewSI, SI);
9404}
9405
9406// Determine how to lower the scalar epilogue, which depends on 1) optimising
9407// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9408// predication, and 4) a TTI hook that analyses whether the loop is suitable
9409// for predication.
9410 static ScalarEpilogueLowering getScalarEpilogueLowering(
9411     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9412     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9413     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9414   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9415 // don't look at hints or options, and don't request a scalar epilogue.
9416 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9417 // LoopAccessInfo (due to code dependency and not being able to reliably get
9418 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9419 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9420 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9421 // back to the old way and vectorize with versioning when forced. See D81345.)
9422   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9423                                                       PGSOQueryType::IRPass) &&
9424                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9425     return CM_ScalarEpilogueNotAllowedOptSize;
9426
9427 // 2) If set, obey the directives
9428   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9429     switch (PreferPredicateOverEpilogue) {
9430     case PreferPredicateTy::ScalarEpilogue:
9431       return CM_ScalarEpilogueAllowed;
9432     case PreferPredicateTy::PredicateElseScalarEpilogue:
9433       return CM_ScalarEpilogueNotNeededUsePredicate;
9434     case PreferPredicateTy::PredicateOrDontVectorize:
9435       return CM_ScalarEpilogueNotAllowedUsePredicate;
9436     };
9437   }
9438
9439 // 3) If set, obey the hints
9440   switch (Hints.getPredicate()) {
9441   case LoopVectorizeHints::FK_Enabled:
9442     return CM_ScalarEpilogueNotNeededUsePredicate;
9443   case LoopVectorizeHints::FK_Disabled:
9444     return CM_ScalarEpilogueAllowed;
9445   };
9446
9447 // 4) if the TTI hook indicates this is profitable, request predication.
9448   TailFoldingInfo TFI(TLI, &LVL, IAI);
9449   if (TTI->preferPredicateOverEpilogue(&TFI))
9450     return CM_ScalarEpilogueNotNeededUsePredicate;
9451
9452   return CM_ScalarEpilogueAllowed;
9453 }
9454
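// Worked example of the precedence above (illustrative, not from the original
// source): under -Os with vectorization not explicitly forced, the result is
// CM_ScalarEpilogueNotAllowedOptSize before any hint or TTI hook is consulted;
// otherwise an explicit -prefer-predicate-over-epilogue= option overrides the
// loop hint, which in turn overrides the TTI preferPredicateOverEpilogue query.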
9455// Process the loop in the VPlan-native vectorization path. This path builds
9456// VPlan upfront in the vectorization pipeline, which allows to apply
9457// VPlan-to-VPlan transformations from the very beginning without modifying the
9458// input LLVM IR.
9459 static bool processLoopInVPlanNativePath(
9460     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9461     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9462     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9463     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9464     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9465     LoopVectorizationRequirements &Requirements) {
9466
9467 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9468 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9469 return false;
9470 }
9471 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9472 Function *F = L->getHeader()->getParent();
9473 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9474
9475   ScalarEpilogueLowering SEL =
9476       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9477
9478 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9479 &Hints, IAI);
9480 // Use the planner for outer loop vectorization.
9481 // TODO: CM is not used at this point inside the planner. Turn CM into an
9482 // optional argument if we don't need it in the future.
9483 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9484 ORE);
9485
9486 // Get user vectorization factor.
9487 ElementCount UserVF = Hints.getWidth();
9488
9489   CM.collectElementTypesForWidening();
9490
9491 // Plan how to best vectorize, return the best VF and its cost.
9492 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9493
9494 // If we are stress testing VPlan builds, do not attempt to generate vector
9495 // code. Masked vector code generation support will follow soon.
9496 // Also, do not attempt to vectorize if no vector code will be produced.
9497   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9498     return false;
9499
9500 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9501
9502 {
9503 bool AddBranchWeights =
9504 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9505 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9506 F->getDataLayout(), AddBranchWeights);
9507 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9508 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9509 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9510 << L->getHeader()->getParent()->getName() << "\"\n");
9511 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9512 }
9513
9514 reportVectorization(ORE, L, VF, 1);
9515
9516 // Mark the loop as already vectorized to avoid vectorizing again.
9517 Hints.setAlreadyVectorized();
9518 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9519 return true;
9520}
9521
9522// Emit a remark if there are stores to floats that required a floating point
9523// extension. If the vectorized loop was generated with floating point there
9524// will be a performance penalty from the conversion overhead and the change in
9525// the vector width.
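// Illustrative source pattern (assumed example, not from the original file)
// that would trigger the remark emitted below:
//   void scale(float *Dst, const float *Src, double K, int N) {
//     for (int I = 0; I < N; ++I)
//       Dst[I] = Src[I] * K; // fpext float->double, fptrunc back to float
//   }
// The implicit float->double->float round trip widens each element and halves
// the number of lanes per vector register.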
9526 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9527   SmallVector<Instruction *, 4> Worklist;
9528   for (BasicBlock *BB : L->getBlocks()) {
9529 for (Instruction &Inst : *BB) {
9530 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9531 if (S->getValueOperand()->getType()->isFloatTy())
9532 Worklist.push_back(S);
9533 }
9534 }
9535 }
9536
9537   // Traverse the floating point stores upwards, searching for floating
9538   // point conversions.
9539   SmallPtrSet<const Instruction *, 4> Visited;
9540   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9541   while (!Worklist.empty()) {
9542 auto *I = Worklist.pop_back_val();
9543 if (!L->contains(I))
9544 continue;
9545 if (!Visited.insert(I).second)
9546 continue;
9547
9548 // Emit a remark if the floating point store required a floating
9549 // point conversion.
9550 // TODO: More work could be done to identify the root cause such as a
9551 // constant or a function return type and point the user to it.
9552 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9553 ORE->emit([&]() {
9554 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9555 I->getDebugLoc(), L->getHeader())
9556 << "floating point conversion changes vector width. "
9557 << "Mixed floating point precision requires an up/down "
9558 << "cast that will negatively impact performance.";
9559 });
9560
9561 for (Use &Op : I->operands())
9562 if (auto *OpI = dyn_cast<Instruction>(Op))
9563 Worklist.push_back(OpI);
9564 }
9565}
9566
9567 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9568                                        VectorizationFactor &VF,
9569                                        std::optional<unsigned> VScale, Loop *L,
9570                                        ScalarEvolution &SE,
9571                                        ScalarEpilogueLowering SEL) {
9572   InstructionCost CheckCost = Checks.getCost();
9573 if (!CheckCost.isValid())
9574 return false;
9575
9576 // When interleaving only scalar and vector cost will be equal, which in turn
9577 // would lead to a divide by 0. Fall back to hard threshold.
9578 if (VF.Width.isScalar()) {
9579 if (CheckCost > VectorizeMemoryCheckThreshold) {
9580 LLVM_DEBUG(
9581 dbgs()
9582 << "LV: Interleaving only is not profitable due to runtime checks\n");
9583 return false;
9584 }
9585 return true;
9586 }
9587
9588 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9589 uint64_t ScalarC = *VF.ScalarCost.getValue();
9590 if (ScalarC == 0)
9591 return true;
9592
9593 // First, compute the minimum iteration count required so that the vector
9594 // loop outperforms the scalar loop.
9595 // The total cost of the scalar loop is
9596 // ScalarC * TC
9597 // where
9598 // * TC is the actual trip count of the loop.
9599 // * ScalarC is the cost of a single scalar iteration.
9600 //
9601 // The total cost of the vector loop is
9602 // RtC + VecC * (TC / VF) + EpiC
9603 // where
9604 // * RtC is the cost of the generated runtime checks
9605 // * VecC is the cost of a single vector iteration.
9606 // * TC is the actual trip count of the loop
9607 // * VF is the vectorization factor
9608 // * EpiC is the cost of the generated epilogue, including the cost
9609 // of the remaining scalar operations.
9610 //
9611 // Vectorization is profitable once the total vector cost is less than the
9612 // total scalar cost:
9613 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9614 //
9615 // Now we can compute the minimum required trip count TC as
9616 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9617 //
9618 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9619 // the computations are performed on doubles, not integers and the result
9620 // is rounded up, hence we get an upper estimate of the TC.
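// Worked example (illustrative numbers, not from the original source): with
// ScalarC = 4, VecC = 10, RtC = 20 and an effective VF of 4,
//   MinTC1 = ceil(RtC * VF / (ScalarC * VF - VecC)) = ceil(80 / 6) = 14,
// i.e. the vector loop plus its runtime checks only beats the scalar loop for
// trip counts of at least 14 iterations.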
9621 unsigned IntVF = VF.Width.getKnownMinValue();
9622 if (VF.Width.isScalable()) {
9623 unsigned AssumedMinimumVscale = 1;
9624 if (VScale)
9625 AssumedMinimumVscale = *VScale;
9626 IntVF *= AssumedMinimumVscale;
9627 }
9628 uint64_t RtC = *CheckCost.getValue();
9629 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9630 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9631
9632 // Second, compute a minimum iteration count so that the cost of the
9633 // runtime checks is only a fraction of the total scalar loop cost. This
9634 // adds a loop-dependent bound on the overhead incurred if the runtime
9635 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9636 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9637 // cost, compute
9638 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9639 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
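// Worked example (continuing the illustrative numbers above): with RtC = 20,
// ScalarC = 4 and X = 10,
//   MinTC2 = ceil(RtC * 10 / ScalarC) = ceil(200 / 4) = 50,
// bounding a failed runtime check to roughly 10% of the scalar loop cost once
// the trip count reaches 50. The final bound is MinTC = max(14, 50) = 50,
// rounded up to 52 for VF = 4 when a scalar epilogue is allowed.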
9640
9641 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9642 // epilogue is allowed, choose the next closest multiple of VF. This should
9643 // partly compensate for ignoring the epilogue cost.
9644 uint64_t MinTC = std::max(MinTC1, MinTC2);
9645 if (SEL == CM_ScalarEpilogueAllowed)
9646 MinTC = alignTo(MinTC, IntVF);
9647   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9648
9649 LLVM_DEBUG(
9650 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9651 << VF.MinProfitableTripCount << "\n");
9652
9653 // Skip vectorization if the expected trip count is less than the minimum
9654 // required trip count.
9655 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9658 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9659 "trip count < minimum profitable VF ("
9660 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9661 << ")\n");
9662
9663 return false;
9664 }
9665 }
9666 return true;
9667}
9668
9669 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9670     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9671                                !EnableLoopInterleaving),
9672       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9673                               !EnableLoopVectorization) {}
9674
9675 bool LoopVectorizePass::processLoop(Loop *L) {
9676 assert((EnableVPlanNativePath || L->isInnermost()) &&
9677 "VPlan-native path is not enabled. Only process inner loops.");
9678
9679 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9680 << L->getHeader()->getParent()->getName() << "' from "
9681 << L->getLocStr() << "\n");
9682
9683 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9684
9685 LLVM_DEBUG(
9686 dbgs() << "LV: Loop hints:"
9687 << " force="
9689 ? "disabled"
9691 ? "enabled"
9692 : "?"))
9693 << " width=" << Hints.getWidth()
9694 << " interleave=" << Hints.getInterleave() << "\n");
9695
9696 // Function containing loop
9697 Function *F = L->getHeader()->getParent();
9698
9699 // Looking at the diagnostic output is the only way to determine if a loop
9700 // was vectorized (other than looking at the IR or machine code), so it
9701 // is important to generate an optimization remark for each loop. Most of
9702 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9703 // generated as OptimizationRemark and OptimizationRemarkMissed are
9704 // less verbose reporting vectorized loops and unvectorized loops that may
9705 // benefit from vectorization, respectively.
9706
9707 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9708 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9709 return false;
9710 }
9711
9712 PredicatedScalarEvolution PSE(*SE, *L);
9713
9714 // Check if it is legal to vectorize the loop.
9715 LoopVectorizationRequirements Requirements;
9716 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9717 &Requirements, &Hints, DB, AC, BFI, PSI);
9719 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9720 Hints.emitRemarkWithHints();
9721 return false;
9722 }
9723
9724 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9725 // here. They may require CFG and instruction level transformations before
9726 // even evaluating whether vectorization is profitable. Since we cannot modify
9727 // the incoming IR, we need to build VPlan upfront in the vectorization
9728 // pipeline.
9729 if (!L->isInnermost())
9730 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9731 ORE, BFI, PSI, Hints, Requirements);
9732
9733 assert(L->isInnermost() && "Inner loop expected.");
9734
9735 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9736 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9737
9738 // If an override option has been passed in for interleaved accesses, use it.
9739 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9740 UseInterleaved = EnableInterleavedMemAccesses;
9741
9742 // Analyze interleaved memory accesses.
9743 if (UseInterleaved)
9744     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9745
9746 // Check the function attributes and profiles to find out if this function
9747 // should be optimized for size.
9748   ScalarEpilogueLowering SEL =
9749       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9750
9751 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9752 // count by optimizing for size, to minimize overheads.
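// Illustrative example (assuming the default vectorizer-min-trip-count of 16):
// a loop with a known trip count of 8 falls below the threshold, so unless the
// user explicitly forces vectorization the epilogue policy is tightened, or the
// loop is rejected with a LowTripCount remark when the target's tail-folding
// threshold is not met either.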
9753 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9754 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9755 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9756 << "This loop is worth vectorizing only if no scalar "
9757 << "iteration overheads are incurred.");
9759 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9760 else {
9761 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9762 LLVM_DEBUG(dbgs() << "\n");
9763 // Predicate tail-folded loops are efficient even when the loop
9764 // iteration count is low. However, setting the epilogue policy to
9765 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9766 // with runtime checks. It's more effective to let
9767 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9768 // for the loop.
9769         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9770           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9771       } else {
9772 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9773 "small to consider vectorizing.\n");
9775 "The trip count is below the minial threshold value.",
9776 "loop trip count is too low, avoiding vectorization",
9777 "LowTripCount", ORE, L);
9778 Hints.emitRemarkWithHints();
9779 return false;
9780 }
9781 }
9782 }
9783
9784 // Check the function attributes to see if implicit floats or vectors are
9785 // allowed.
9786 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9788 "Can't vectorize when the NoImplicitFloat attribute is used",
9789 "loop not vectorized due to NoImplicitFloat attribute",
9790 "NoImplicitFloat", ORE, L);
9791 Hints.emitRemarkWithHints();
9792 return false;
9793 }
9794
9795 // Check if the target supports potentially unsafe FP vectorization.
9796 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9797 // for the target we're vectorizing for, to make sure none of the
9798 // additional fp-math flags can help.
9799 if (Hints.isPotentiallyUnsafe() &&
9802 "Potentially unsafe FP op prevents vectorization",
9803 "loop not vectorized due to unsafe FP support.",
9804 "UnsafeFP", ORE, L);
9805 Hints.emitRemarkWithHints();
9806 return false;
9807 }
9808
9809 bool AllowOrderedReductions;
9810 // If the flag is set, use that instead and override the TTI behaviour.
9811 if (ForceOrderedReductions.getNumOccurrences() > 0)
9812 AllowOrderedReductions = ForceOrderedReductions;
9813 else
9814 AllowOrderedReductions = TTI->enableOrderedReductions();
9815 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9816 ORE->emit([&]() {
9817 auto *ExactFPMathInst = Requirements.getExactFPInst();
9818 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9819 ExactFPMathInst->getDebugLoc(),
9820 ExactFPMathInst->getParent())
9821 << "loop not vectorized: cannot prove it is safe to reorder "
9822 "floating-point operations";
9823 });
9824 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9825 "reorder floating-point operations\n");
9826 Hints.emitRemarkWithHints();
9827 return false;
9828 }
9829
9830 // Use the cost model.
9831 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9832 F, &Hints, IAI);
9833 // Use the planner for vectorization.
9834 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9835 ORE);
9836
9837 // Get user vectorization factor and interleave count.
9838 ElementCount UserVF = Hints.getWidth();
9839 unsigned UserIC = Hints.getInterleave();
9840
9841 // Plan how to best vectorize, return the best VF and its cost.
9842 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9843
9844   VectorizationFactor VF = VectorizationFactor::Disabled();
9845   unsigned IC = 1;
9846
9847 bool AddBranchWeights =
9848 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9849 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9850 F->getDataLayout(), AddBranchWeights);
9851 if (MaybeVF) {
9852 VF = *MaybeVF;
9853 // Select the interleave count.
9854 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9855
9856 unsigned SelectedIC = std::max(IC, UserIC);
9857 // Optimistically generate runtime checks if they are needed. Drop them if
9858 // they turn out to not be profitable.
9859 if (VF.Width.isVector() || SelectedIC > 1)
9860 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9861
9862 // Check if it is profitable to vectorize with runtime checks.
9863     bool ForceVectorization =
9864         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9865     if (!ForceVectorization &&
9866         !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
9867                                     *PSE.getSE(), SEL)) {
9868       ORE->emit([&]() {
9869         return OptimizationRemarkAnalysisAliasing(
9870             DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9871 L->getHeader())
9872 << "loop not vectorized: cannot prove it is safe to reorder "
9873 "memory operations";
9874 });
9875 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9876 Hints.emitRemarkWithHints();
9877 return false;
9878 }
9879 }
9880
9881 // Identify the diagnostic messages that should be produced.
9882 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9883 bool VectorizeLoop = true, InterleaveLoop = true;
9884 if (VF.Width.isScalar()) {
9885 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9886 VecDiagMsg = std::make_pair(
9887 "VectorizationNotBeneficial",
9888 "the cost-model indicates that vectorization is not beneficial");
9889 VectorizeLoop = false;
9890 }
9891
9892 if (!MaybeVF && UserIC > 1) {
9893 // Tell the user interleaving was avoided up-front, despite being explicitly
9894 // requested.
9895 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9896 "interleaving should be avoided up front\n");
9897 IntDiagMsg = std::make_pair(
9898 "InterleavingAvoided",
9899 "Ignoring UserIC, because interleaving was avoided up front");
9900 InterleaveLoop = false;
9901 } else if (IC == 1 && UserIC <= 1) {
9902 // Tell the user interleaving is not beneficial.
9903 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9904 IntDiagMsg = std::make_pair(
9905 "InterleavingNotBeneficial",
9906 "the cost-model indicates that interleaving is not beneficial");
9907 InterleaveLoop = false;
9908 if (UserIC == 1) {
9909 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9910 IntDiagMsg.second +=
9911 " and is explicitly disabled or interleave count is set to 1";
9912 }
9913 } else if (IC > 1 && UserIC == 1) {
9914 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9915 LLVM_DEBUG(
9916 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9917 IntDiagMsg = std::make_pair(
9918 "InterleavingBeneficialButDisabled",
9919 "the cost-model indicates that interleaving is beneficial "
9920 "but is explicitly disabled or interleave count is set to 1");
9921 InterleaveLoop = false;
9922 }
9923
9924 // Override IC if user provided an interleave count.
9925 IC = UserIC > 0 ? UserIC : IC;
9926
9927 // Emit diagnostic messages, if any.
9928 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9929 if (!VectorizeLoop && !InterleaveLoop) {
9930 // Do not vectorize or interleave the loop.
9931 ORE->emit([&]() {
9932 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9933 L->getStartLoc(), L->getHeader())
9934 << VecDiagMsg.second;
9935 });
9936 ORE->emit([&]() {
9937 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9938 L->getStartLoc(), L->getHeader())
9939 << IntDiagMsg.second;
9940 });
9941 return false;
9942 } else if (!VectorizeLoop && InterleaveLoop) {
9943 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9944 ORE->emit([&]() {
9945 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9946 L->getStartLoc(), L->getHeader())
9947 << VecDiagMsg.second;
9948 });
9949 } else if (VectorizeLoop && !InterleaveLoop) {
9950 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9951 << ") in " << L->getLocStr() << '\n');
9952 ORE->emit([&]() {
9953 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9954 L->getStartLoc(), L->getHeader())
9955 << IntDiagMsg.second;
9956 });
9957 } else if (VectorizeLoop && InterleaveLoop) {
9958 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9959 << ") in " << L->getLocStr() << '\n');
9960 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9961 }
9962
9963 bool DisableRuntimeUnroll = false;
9964 MDNode *OrigLoopID = L->getLoopID();
9965 {
9966 using namespace ore;
9967 if (!VectorizeLoop) {
9968 assert(IC > 1 && "interleave count should not be 1 or 0");
9969 // If we decided that it is not legal to vectorize the loop, then
9970 // interleave it.
9971 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9972 &CM, BFI, PSI, Checks);
9973
9974 VPlan &BestPlan = LVP.getBestPlan();
9975 assert(BestPlan.hasScalarVFOnly() &&
9976 "VPlan cost model and legacy cost model disagreed");
9977 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
9978
9979 ORE->emit([&]() {
9980 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9981 L->getHeader())
9982 << "interleaved loop (interleaved count: "
9983 << NV("InterleaveCount", IC) << ")";
9984 });
9985 } else {
9986 // If we decided that it is *legal* to vectorize the loop, then do it.
9987
9988 // Consider vectorizing the epilogue too if it's profitable.
9989 VectorizationFactor EpilogueVF =
9990           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
9991       if (EpilogueVF.Width.isVector()) {
9992
9993 // The first pass vectorizes the main loop and creates a scalar epilogue
9994 // to be vectorized by executing the plan (potentially with a different
9995 // factor) again shortly afterwards.
9996 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
9997 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
9998 EPI, &LVL, &CM, BFI, PSI, Checks);
9999
10000 std::unique_ptr<VPlan> BestMainPlan(
10001             LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
10002         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10003 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10004 ++LoopsVectorized;
10005
10006 // Second pass vectorizes the epilogue and adjusts the control flow
10007 // edges from the first pass.
10008 EPI.MainLoopVF = EPI.EpilogueVF;
10009 EPI.MainLoopUF = EPI.EpilogueUF;
10010 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10011 ORE, EPI, &LVL, &CM, BFI, PSI,
10012 Checks);
10013
10014 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10015 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10016 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10017 Header->setName("vec.epilog.vector.body");
10018
10019 // Re-use the trip count and steps expanded for the main loop, as
10020 // skeleton creation needs it as a value that dominates both the scalar
10021 // and vector epilogue loops
10022 // TODO: This is a workaround needed for epilogue vectorization and it
10023 // should be removed once induction resume value creation is done
10024 // directly in VPlan.
10025 EpilogILV.setTripCount(MainILV.getTripCount());
10026 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10027 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10028 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10029 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10030 ExpandR->replaceAllUsesWith(ExpandedVal);
10031 if (BestEpiPlan.getTripCount() == ExpandR)
10032 BestEpiPlan.resetTripCount(ExpandedVal);
10033 ExpandR->eraseFromParent();
10034 }
10035
10036 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10037 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10038 // before vectorizing the epilogue loop.
10039 for (VPRecipeBase &R : Header->phis()) {
10040 if (isa<VPCanonicalIVPHIRecipe>(&R))
10041 continue;
10042
10043 Value *ResumeV = nullptr;
10044 // TODO: Move setting of resume values to prepareToExecute.
10045 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10046 const RecurrenceDescriptor &RdxDesc =
10047 ReductionPhi->getRecurrenceDescriptor();
10048 RecurKind RK = RdxDesc.getRecurrenceKind();
10049 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10050           if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10051             // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10052 // start value; compare the final value from the main vector loop
10053 // to the start value.
10054 IRBuilder<> Builder(
10055 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10056 ResumeV = Builder.CreateICmpNE(ResumeV,
10057 RdxDesc.getRecurrenceStartValue());
10058 }
10059 } else {
10060 // Create induction resume values for both widened pointer and
10061 // integer/fp inductions and update the start value of the induction
10062 // recipes to use the resume value.
10063 PHINode *IndPhi = nullptr;
10064 const InductionDescriptor *ID;
10065 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10066 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10067 ID = &Ind->getInductionDescriptor();
10068 } else {
10069 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10070 IndPhi = WidenInd->getPHINode();
10071 ID = &WidenInd->getInductionDescriptor();
10072 }
10073
10074 ResumeV = MainILV.createInductionResumeValue(
10075 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10076                 {EPI.MainLoopIterationCountCheck});
10077           }
10078 assert(ResumeV && "Must have a resume value");
10079 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10080 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10081 }
10082
10083 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10084 "DT not preserved correctly");
10085 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10086 DT, true, &ExpandedSCEVs);
10087 ++LoopsEpilogueVectorized;
10088
10089 if (!MainILV.areSafetyChecksAdded())
10090 DisableRuntimeUnroll = true;
10091 } else {
10092 VPlan &BestPlan = LVP.getBestPlan();
10093 assert(size(BestPlan.vectorFactors()) == 1 &&
10094 "Plan should have a single VF");
10095 ElementCount Width = *BestPlan.vectorFactors().begin();
10096 LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
10097 << "\n");
10098 assert(VF.Width == Width &&
10099 "VPlan cost model and legacy cost model disagreed");
10100 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
10101 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10102 PSI, Checks);
10103 LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
10104 ++LoopsVectorized;
10105
10106 // Add metadata to disable runtime unrolling a scalar loop when there
10107 // are no runtime checks about strides and memory. A scalar loop that is
10108 // rarely used is not worth unrolling.
10109 if (!LB.areSafetyChecksAdded())
10110 DisableRuntimeUnroll = true;
10111 }
10112 // Report the vectorization decision.
10113 reportVectorization(ORE, L, VF, IC);
10114 }
10115
10116     if (ORE->allowExtraAnalysis(LV_NAME))
10117       checkMixedPrecision(L, ORE);
10118   }
10119
10120 std::optional<MDNode *> RemainderLoopID =
10121       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10122                                       LLVMLoopVectorizeFollowupEpilogue});
10123   if (RemainderLoopID) {
10124 L->setLoopID(*RemainderLoopID);
10125 } else {
10126 if (DisableRuntimeUnroll)
10127       AddRuntimeUnrollDisableMetaData(L);
10128
10129 // Mark the loop as already vectorized to avoid vectorizing again.
10130 Hints.setAlreadyVectorized();
10131 }
10132
10133 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10134 return true;
10135}
10136
10137 LoopVectorizeResult LoopVectorizePass::runImpl(
10138     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10139     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10140     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10141     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10142   SE = &SE_;
10143 LI = &LI_;
10144 TTI = &TTI_;
10145 DT = &DT_;
10146 BFI = BFI_;
10147 TLI = TLI_;
10148 AC = &AC_;
10149 LAIs = &LAIs_;
10150 DB = &DB_;
10151 ORE = &ORE_;
10152 PSI = PSI_;
10153
10154 // Don't attempt if
10155 // 1. the target claims to have no vector registers, and
10156 // 2. interleaving won't help ILP.
10157 //
10158 // The second condition is necessary because, even if the target has no
10159 // vector registers, loop vectorization may still enable scalar
10160 // interleaving.
10161   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10162       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10163     return LoopVectorizeResult(false, false);
10164
10165 bool Changed = false, CFGChanged = false;
10166
10167 // The vectorizer requires loops to be in simplified form.
10168 // Since simplification may add new inner loops, it has to run before the
10169 // legality and profitability checks. This means running the loop vectorizer
10170 // will simplify all loops, regardless of whether anything ends up being
10171 // vectorized.
10172 for (const auto &L : *LI)
10173 Changed |= CFGChanged |=
10174 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10175
10176 // Build up a worklist of inner-loops to vectorize. This is necessary as
10177 // the act of vectorizing or partially unrolling a loop creates new loops
10178 // and can invalidate iterators across the loops.
10179 SmallVector<Loop *, 8> Worklist;
10180
10181 for (Loop *L : *LI)
10182 collectSupportedLoops(*L, LI, ORE, Worklist);
10183
10184 LoopsAnalyzed += Worklist.size();
10185
10186 // Now walk the identified inner loops.
10187 while (!Worklist.empty()) {
10188 Loop *L = Worklist.pop_back_val();
10189
10190 // For the inner loops we actually process, form LCSSA to simplify the
10191 // transform.
10192 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10193
10194 Changed |= CFGChanged |= processLoop(L);
10195
10196 if (Changed) {
10197 LAIs->clear();
10198
10199#ifndef NDEBUG
10200 if (VerifySCEV)
10201 SE->verify();
10202#endif
10203 }
10204 }
10205
10206 // Process each loop nest in the function.
10207 return LoopVectorizeResult(Changed, CFGChanged);
10208}
10209
10210 PreservedAnalyses LoopVectorizePass::run(Function &F,
10211                                          FunctionAnalysisManager &AM) {
10212   auto &LI = AM.getResult<LoopAnalysis>(F);
10213 // There are no loops in the function. Return before computing other expensive
10214 // analyses.
10215 if (LI.empty())
10216 return PreservedAnalyses::all();
10217   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10218   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10219 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10220 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10221 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10222 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10224
10226 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10227   ProfileSummaryInfo *PSI =
10228       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10229 BlockFrequencyInfo *BFI = nullptr;
10230 if (PSI && PSI->hasProfileSummary())
10231     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10232   LoopVectorizeResult Result =
10233 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10234 if (!Result.MadeAnyChange)
10235 return PreservedAnalyses::all();
10236   PreservedAnalyses PA;
10237
10238 if (isAssignmentTrackingEnabled(*F.getParent())) {
10239 for (auto &BB : F)
10240       RemoveRedundantDbgInstrs(&BB);
10241   }
10242
10243 PA.preserve<LoopAnalysis>();
10247
10248 if (Result.MadeCFGChange) {
10249 // Making CFG changes likely means a loop got vectorized. Indicate that
10250 // extra simplification passes should be run.
10251 // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
10252 // be run if runtime checks have been added.
10255 } else {
10257 }
10258 return PA;
10259}
10260
10261 void LoopVectorizePass::printPipeline(
10262     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10263 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10264 OS, MapClassName2PassName);
10265
10266 OS << '<';
10267 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10268 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10269 OS << '>';
10270}
@ Poison
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
static void emitInvalidCostRemarks(SmallVector< InstructionVFPair > InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I)
Create an analysis remark that explains why vectorization failed.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan)
Feed a resume value for every FOR from the vector loop to the scalar loop, if middle block branches t...
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
This file contains the declarations for profiling metadata utility functions.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:459
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:232
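The vscale_range minimum queried above feeds the reasoning about scalable vectorization factors. A minimal sketch of reading it off a function; the helper name and the fallback of 1 are illustrative, not taken from this file:

#include "llvm/IR/Function.h"
using namespace llvm;

// Read the vscale_range minimum of F; fall back to 1 when the attribute is absent.
static unsigned minVScaleOf(const Function &F) {
  Attribute A = F.getFnAttribute(Attribute::VScaleRange);
  return A.isValid() ? A.getVScaleRangeMin() : 1;
}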
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:507
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:372
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:365
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:457
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
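A minimal sketch of the BasicBlock traversal APIs listed above; the helper and the specific checks are illustrative only:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Count the PHIs of BB and note whether it has a unique predecessor.
static unsigned countPhis(BasicBlock *BB, bool &HasSinglePred) {
  HasSinglePred = BB->getSinglePredecessor() != nullptr;
  unsigned N = 0;
  for (PHINode &Phi : BB->phis()) {
    (void)Phi;
    ++N;
  }
  return N;
}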
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
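A minimal sketch of the call-inspection APIs above; the helper and its policy of skipping indirect and no-builtin calls are illustrative only:

#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Return the first argument of a direct call, or nullptr if it does not qualify.
static Value *firstArgOfDirectCall(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  if (!Callee || CI->isNoBuiltin() || CI->arg_size() == 0)
    return nullptr;
  return CI->getArgOperand(0);
}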
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ FIRST_ICMP_PREDICATE
Definition: InstrTypes.h:788
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
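A minimal, self-contained sketch of the DenseMap operations listed above; the key and value types are chosen arbitrarily for illustration:

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

// insert() is a no-op for an existing key; lookup() yields 0 for a missing one.
static unsigned rememberDepth(DenseMap<const void *, unsigned> &Depths,
                              const void *Key, unsigned Depth) {
  Depths.insert({Key, Depth});
  return Depths.contains(Key) ? Depths.lookup(Key) : 0;
}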
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
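A minimal sketch of the dominator-tree updates listed above, of the kind a skeleton-building step has to perform; the block names here are placeholders, not the pass's own:

#include "llvm/IR/Dominators.h"
#include <cassert>
using namespace llvm;

// Register NewBB under IDom, then re-parent ReparentedBB onto NewBB.
static void hookUpBlock(DominatorTree &DT, BasicBlock *NewBB, BasicBlock *IDom,
                        BasicBlock *ReparentedBB) {
  DT.addNewBlock(NewBB, IDom);
  DT.changeImmediateDominator(ReparentedBB, NewBB);
  assert(DT.verify() && "dominator tree left inconsistent");
}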
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:317
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
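The vectorization factor is carried as an ElementCount, which can be fixed or scalable. A small sketch of the constructors and predicates listed above:

#include "llvm/Support/TypeSize.h"
using namespace llvm;

static void elementCountExamples() {
  ElementCount FixedVF = ElementCount::getFixed(4);    // exactly 4 lanes
  ElementCount ScalVF = ElementCount::getScalable(2);  // 2 x vscale lanes
  (void)FixedVF.isVector(); // true: more than one element
  (void)ScalVF.isScalar();  // false: <vscale x 2> is never a single element
}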
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:745
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:695
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:719
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Definition: Instructions.h:938
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:922
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1193
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2250
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1871
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2246
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1349
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1332
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2356
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:513
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1409
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Definition: IRBuilder.cpp:110
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1366
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
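A minimal sketch of the IRBuilder calls listed above; the helper, its operands, and the clamp logic are purely illustrative:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Append a guarded add to the end of BB: returns X + Y, or X if the unsigned
// sum wrapped (used here only as an example of chaining builder calls).
static Value *emitClampedAdd(BasicBlock *BB, Value *X, Value *Y) {
  IRBuilder<> B(BB);                     // insert at the end of BB
  Value *Sum = B.CreateAdd(X, Y, "sum");
  Value *Wrapped = B.CreateICmpULT(Sum, X, "wrapped");
  return B.CreateSelect(Wrapped, X, Sum, "clamped");
}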
A struct for saving information about induction variables.
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitablity analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:97
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace the specified successor OldBB with the provided block NewBB.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:470
uint32_t getFactor() const
Definition: VectorUtils.h:486
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:540
InstTy * getInsertPos() const
Definition: VectorUtils.h:556
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:612
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:657
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:668
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:649
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:632
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:662
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
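A minimal sketch of querying the interleaving analysis for one memory instruction; the helper name and the fallback of 1 are illustrative:

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

// Return the interleave factor of MemInstr's group, or 1 if it is not grouped.
static unsigned interleaveFactorOf(const InterleavedAccessInfo &IAI,
                                   const Instruction *MemInstr) {
  if (const InterleaveGroup<Instruction> *Group =
          IAI.getInterleaveGroup(MemInstr))
    return Group->getFactor();
  return 1;
}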
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:174
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic stride, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop, i.e. its exit blocks: the blocks outside the loop that are branched to from within it.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if the terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
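A minimal sketch of the loop-shape queries listed above, roughly the structural preconditions a vectorizer needs; the helper is illustrative, not the legality check itself:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// An innermost loop with a preheader, a single latch and a single exit block.
static bool hasSimpleShape(const Loop *L) {
  SmallVector<BasicBlock *, 4> ExitBlocks;
  L->getExitBlocks(ExitBlocks);
  return L->isInnermost() && L->getLoopPreheader() && L->getLoopLatch() &&
         ExitBlocks.size() == 1;
}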
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1254
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Set up cost-based decisions for the user-provided vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked in order to fold the loop tail.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for both cases: whether or not the IV update may overflow.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
InstructionCost expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
A memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns true if the given address is invariant and is used to store a recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if the vector representation of the instruction I requires a mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
VPlan & getBestPlan() const
Return the most profitable plan and fix its VF to the most profitable one.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints that enable vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
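A minimal sketch of building a one-operand llvm.loop hint with the metadata APIs above, in the spirit of AddRuntimeUnrollDisableMetaData; the free function here is illustrative:

#include "llvm/IR/Metadata.h"
using namespace llvm;

// Create !{"llvm.loop.unroll.runtime.disable"} in the given context.
static MDNode *makeDisableRuntimeUnrollHint(LLVMContext &Ctx) {
  MDString *Name = MDString::get(Ctx, "llvm.loop.unroll.runtime.disable");
  return MDNode::get(Ctx, Name);
}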
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR uni...
Definition: PassManager.h:688
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
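A minimal sketch of the PHINode APIs above, shaped like the induction resume values described earlier; all names and arguments here are placeholders:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// A two-way resume PHI at the top of the scalar preheader.
static PHINode *makeResumePhi(Type *Ty, BasicBlock *ScalarPH, Value *VectorVal,
                              BasicBlock *MiddleBlock, Value *InitVal,
                              BasicBlock *BypassBlock) {
  PHINode *Resume = PHINode::Create(Ty, /*NumReservedValues=*/2, "resume.val",
                                    ScalarPH->begin());
  Resume->addIncoming(VectorVal, MiddleBlock);
  Resume->addIncoming(InitVal, BypassBlock);
  return Resume;
}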
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1852
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
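A minimal sketch of the PreservedAnalyses usage listed above, following the common new-pass-manager pattern; the choice of which analyses to preserve here is illustrative only:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Report everything preserved when nothing changed; otherwise name survivors.
static PreservedAnalyses reportPreserved(bool Changed) {
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}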
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
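A minimal sketch of turning a backedge-taken count into a trip count with the ScalarEvolution calls above; the helper is illustrative, and the pass itself goes through PredicatedScalarEvolution:

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Trip count = backedge-taken count + 1, in a type that cannot overflow.
static const SCEV *tripCountSCEV(ScalarEvolution &SE, const Loop *L) {
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BTC))
    return BTC; // unknown trip count; nothing to convert
  return SE.getTripCountFromExitCount(BTC);
}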
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
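A minimal sketch of a SetVector-based worklist as listed above: deduplicated, with deterministic insertion-order iteration; the element type is arbitrary:

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// insert() returns false for already-seen items, so each one is visited once.
static unsigned drainWorklist(SmallSetVector<Instruction *, 8> &Worklist) {
  unsigned Visited = 0;
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    (void)I; // "process" I here
    ++Visited;
  }
  return Visited;
}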
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:361
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
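A minimal sketch of the SmallVector operations listed above; the inline size and element type are arbitrary:

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Up to 4 elements live inline; only larger sizes touch the heap.
static SmallVector<int, 4> collectInts(const SmallVectorImpl<int> &Extra) {
  SmallVector<int, 4> Out;
  Out.push_back(1);
  Out.emplace_back(2);
  Out.append(Extra.begin(), Extra.end());
  return Out;
}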
An instruction for storing to memory.
Definition: Instructions.h:290
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction's unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
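The TargetTransformInfo hooks above are the interface through which the cost model prices candidate operations. A hedged sketch of such a query; the priceWideLoad helper and its parameters are assumptions for illustration, not the cost model's actual code:

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Sketch: price a vector load either as a plain wide load or as a masked
// load, depending on what the target reports as legal.
static InstructionCost priceWideLoad(const TargetTransformInfo &TTI,
                                     Type *VecTy, Align Alignment,
                                     unsigned AddrSpace, bool NeedsMask) {
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  if (NeedsMask && TTI.isLegalMaskedLoad(VecTy, Alignment))
    return TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment,
                                     AddrSpace, CostKind);
  return TTI.getMemoryOpCost(Instruction::Load, VecTy, Alignment, AddrSpace,
                             CostKind);
}
```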
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:243
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
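The Type predicates above are what widening decisions key off; a small hedged sketch (the helper name is illustrative):

```cpp
#include "llvm/IR/Type.h"
#include <utility>

using namespace llvm;

// Sketch: report the element type that would be widened and its bit width,
// treating a vector type and its scalar element uniformly.
static std::pair<Type *, unsigned> getElementInfo(Type *Ty) {
  Type *ElemTy = Ty->getScalarType(); // element type, or Ty itself if scalar
  if (ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy())
    return {ElemTy, ElemTy->getScalarSizeInBits()};
  return {nullptr, 0}; // pointers and other types handled elsewhere
}
```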
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2971
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3043
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:2995
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:483
iterator end()
Definition: VPlan.h:3005
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3003
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3056
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:212
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3034
bool empty() const
Definition: VPlan.h:3014
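The iterator and phis() accessors above support straightforward recipe walks over a VPBasicBlock. A hedged sketch using only the vectorizer's local VPlan.h header; the function itself is illustrative:

```cpp
#include "VPlan.h" // local header of the vectorizer, as used by this file

using namespace llvm;

// Sketch: visit the PHI-like recipes first, then count everything after them.
static unsigned countNonPhiRecipes(VPBasicBlock *VPBB) {
  for (VPRecipeBase &PhiR : VPBB->phis())
    (void)PhiR; // header/PHI-like recipes come first in the block
  unsigned NumNonPhi = 0;
  for (auto It = VPBB->getFirstNonPhi(); It != VPBB->end(); ++It)
    ++NumNonPhi;
  return NumNonPhi;
}
```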
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2025
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:437
VPRegionBlock * getParent()
Definition: VPlan.h:509
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:177
void setName(const Twine &newName)
Definition: VPlan.h:502
VPlan * getPlan()
Definition: VPlan.cpp:150
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:155
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:544
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:534
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3584
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
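VPBuilder mirrors IRBuilder at the VPlan level. The sketch below shows the common mask-combining idiom; it assumes VPBuilder is visible through the vectorizer's local headers and is not the code this file uses:

```cpp
#include "LoopVectorizationPlanner.h" // assumed location of VPBuilder
#include "VPlan.h"

using namespace llvm;

// Sketch: AND two masks, where a null mask means "all lanes active".
static VPValue *combineMasks(VPBuilder &Builder, VPValue *A, VPValue *B,
                             DebugLoc DL) {
  if (!A)
    return B;
  if (!B)
    return A;
  return Builder.createLogicalAnd(A, B, DL, "combined.mask");
}
```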
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2710
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:2739
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:396
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:2909
VPValue * getStartValue() const
Definition: VPlan.h:2908
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1711
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1755
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1744
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1229
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:1247
unsigned getOpcode() const
Definition: VPlan.h:1341
VPInterleaveRecipe is a recipe for transforming an interleave group of loads or stores into one wide l...
Definition: VPlan.h:2082
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:196
static VPLane getFirstLane()
Definition: VPlan.h:180
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:764
VPBasicBlock * getParent()
Definition: VPlan.h:789
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:860
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for the given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1149
A recipe for handling reduction phis.
Definition: VPlan.h:1966
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2020
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2012
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2173
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3149
const VPBlockBase * getEntry() const
Definition: VPlan.h:3188
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3220
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2288
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2328
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:891
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:955
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:39
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:202
operand_range operands()
Definition: VPlanValue.h:272
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:257
unsigned getNumOperands() const
Definition: VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:252
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:246
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1421
user_iterator user_begin()
Definition: VPlanValue.h:128
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:172
user_iterator user_end()
Definition: VPlanValue.h:130
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:167
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1425
user_range users()
Definition: VPlanValue.h:132
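The VPValue/VPUser accessors above expose VPlan's def-use edges; a hedged sketch of walking them (the helper is illustrative):

```cpp
#include "VPlan.h" // pulls in VPlanValue.h, where VPValue/VPUser live

using namespace llvm;

// Sketch: count how many operand slots across all users refer to V.
static unsigned countUseSlots(VPValue *V) {
  unsigned NumSlots = 0;
  for (VPUser *U : V->users())
    for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I)
      if (U->getOperand(I) == V)
        ++NumSlots;
  return NumSlots;
}
```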
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1655
A recipe for widening Call instructions.
Definition: VPlan.h:1526
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2835
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1439
A recipe for handling GEP instructions.
Definition: VPlan.h:1613
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1768
A common base class for widening memory operations.
Definition: VPlan.h:2445
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition: VPlan.h:2453
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition: VPlan.h:2492
Instruction & Ingredient
Definition: VPlan.h:2447
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2506
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2499
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition: VPlan.h:2496
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1894
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1933
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1930
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1406
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3253
void printDOT(raw_ostream &O) const
Print this VPlan in DOT format to O.
Definition: VPlan.cpp:1173
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:916
VPBasicBlock * getEntry()
Definition: VPlan.h:3355
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3380
void setName(const Twine &newName)
Definition: VPlan.h:3417
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3383
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3359
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3373
void removeLiveOut(PHINode *PN)
Definition: VPlan.h:3474
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:3400
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:1182
VPBasicBlock * getPreheader()
Definition: VPlan.h:3493
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3455
bool hasVF(ElementCount VF)
Definition: VPlan.h:3393
bool hasUF(unsigned UF) const
Definition: VPlan.h:3406
void setVF(ElementCount VF)
Definition: VPlan.h:3387
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1086
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3366
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header ) which con...
Definition: VPlan.cpp:858
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3421
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:1179
bool hasScalarVFOnly() const
Definition: VPlan.h:3404
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:976
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3463
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3479
void print(raw_ostream &O) const
Print this VPlan to O.
Definition: VPlan.cpp:1123
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3483
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1225
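The VPlan accessors above are how the planner narrows a candidate plan down to a concrete configuration; a hedged sketch (the helper is illustrative):

```cpp
#include "VPlan.h"

using namespace llvm;

// Sketch: commit a plan to a single vectorization factor, if it supports it.
static bool trySetSingleVF(VPlan &Plan, ElementCount VF) {
  if (!Plan.hasVF(VF))
    return false;
  Plan.setVF(VF);
  Plan.setName("Final VPlan");
  return true;
}
```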
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:82
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:78
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
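ElementCount and the other FixedOrScalableQuantity helpers above let the same arithmetic cover fixed VFs (e.g. 4) and scalable VFs (e.g. vscale x 4); a short sketch with illustrative helpers:

```cpp
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Sketch: double a VF regardless of whether it is fixed or scalable.
static ElementCount doubleVF(ElementCount VF) {
  return VF.multiplyCoefficientBy(2);
}

// Sketch: conservative "not wider than" check; fixed and scalable counts are
// only comparable with each other.
static bool isKnownNotWiderThan(ElementCount A, ElementCount B) {
  return A.isScalable() == B.isScalable() && ElementCount::isKnownLE(A, B);
}
```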
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
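The PatternMatch combinators above compose into declarative IR matchers. A hedged sketch of recognizing a single-use multiply of extended operands, roughly the shape considered when pricing extended multiply-accumulate reductions; the helper is illustrative:

```cpp
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Sketch: match V against (mul (zext|sext A), (zext|sext B)) with one use,
// capturing the pre-extension operands in A and B.
static bool matchSingleUseExtendedMul(Value *V, Value *&A, Value *&B) {
  return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(A)),
                                 m_ZExtOrSExt(m_Value(B)))));
}
```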
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1610
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3808
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1894
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:988
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7128
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:214
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
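vp_depth_first_shallow and vp_depth_first_deep are the standard ways to walk a VPlan's block graph, without or with descending into regions. A hedged sketch, again using the vectorizer's local headers:

```cpp
#include "VPlan.h"
#include "VPlanCFG.h" // local header declaring the vp_depth_first_* adaptors

using namespace llvm;

// Sketch: count every VPBasicBlock in the plan, including blocks nested
// inside replicate regions.
static unsigned countBasicBlocks(VPlan &Plan) {
  unsigned NumVPBBs = 0;
  for (VPBlockBase *Block : vp_depth_first_deep(Plan.getEntry()))
    if (isa<VPBasicBlock>(Block))
      ++NumVPBBs;
  return NumVPBBs;
}
```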
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:55
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:147
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:120
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
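getLoadStoreAlignment and the related getLoadStore* helpers (address space, pointer operand, type) extract the pieces of a memory access without caring whether it is a load or a store. A hedged sketch; the MemOpInfo struct is illustrative:

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"
#include <cassert>

using namespace llvm;

// Sketch: bundle the facts about a load/store that cost queries usually need.
struct MemOpInfo {
  Type *AccessTy;
  const Value *Ptr;
  Align Alignment;
  unsigned AddrSpace;
};

static MemOpInfo getMemOpInfo(Instruction *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "expected a load or store");
  return {getLoadStoreType(I), getLoadStorePointerOperand(I),
          getLoadStoreAlignment(I), getLoadStoreAddressSpace(I)};
}
```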
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:135
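ToVectorTy is the helper used to widen scalar types to a chosen VF. A hedged sketch of typical use; the wrapper is illustrative:

```cpp
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Sketch: widen ScalarTy to VF lanes; for VF == 1 keep the scalar type.
static Type *widenToVF(Type *ScalarTy, ElementCount VF) {
  if (VF.isScalar())
    return ScalarTy;
  return ToVectorTy(ScalarTy, VF); // essentially VectorType::get(ScalarTy, VF)
}
```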
TargetTransformInfo TTI
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2242
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1701
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
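all_of, any_of, find_if, is_contained and the other range wrappers listed above replace explicit begin()/end() pairs throughout LLVM; a short illustrative sketch:

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Sketch: predicate and search helpers over a range instead of iterators.
static bool allPositive(const SmallVectorImpl<int> &Vals) {
  return all_of(Vals, [](int V) { return V > 0; });
}

static const int *findFirstNegative(const SmallVectorImpl<int> &Vals) {
  auto It = find_if(Vals, [](int V) { return V < 0; });
  return It == Vals.end() ? nullptr : &*It;
}
```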
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1953
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlan.h:95
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
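bit_floor, together with isPowerOf2_32 and divideCeil listed earlier, covers the small integer arithmetic the vectorizer needs around interleave counts and trip counts. A hedged sketch; the helpers are illustrative:

```cpp
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

using namespace llvm;

// Sketch: round a requested interleave count down to a power of two.
static unsigned roundDownToPowerOfTwo(unsigned IC) {
  return isPowerOf2_32(IC) ? IC : bit_floor(IC);
}

// Sketch: number of vector iterations needed to cover TripCount elements.
static uint64_t numVectorIterations(uint64_t TripCount, uint64_t VFxUF) {
  return divideCeil(TripCount, VFxUF);
}
```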
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
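hash_combine and hash_combine_range are the building blocks LLVM uses for structural hashing; a small sketch (the function is illustrative):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"

using namespace llvm;

// Sketch: hash an opcode together with an arbitrary list of operand pointers.
static hash_code hashOpcodeAndOperands(unsigned Opcode,
                                       ArrayRef<const void *> Operands) {
  return hash_combine(Opcode,
                      hash_combine_range(Operands.begin(), Operands.end()));
}
```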
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:86
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:100
ElementCount End
Definition: VPlan.h:105
Struct to hold the various analyses needed for cost computations.
Definition: VPlan.h:737
LoopVectorizationCostModel & CM
Definition: VPlan.h:741
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlan.h:742
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1939
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:238
bool isFirstIteration() const
Definition: VPlan.h:250
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:384
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:392
BasicBlock * getPreheaderBBFor(VPRecipeBase *R)
Returns the BasicBlock* mapped to the pre-header of the loop region containing R.
Definition: VPlan.cpp:356
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:255
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:254
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:429
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:432
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:369
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:425
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:361
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:401
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:307
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:267
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:409
VPlan * Plan
Pointer to the VPlan that code is generated for.
Definition: VPlan.h:415
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:412
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:261
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:380
void execute(VPTransformState &State) override
Generate the wide load or gather.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2572
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2521
void execute(VPTransformState &State) override
Generate a wide load or gather.
A recipe for widening select instructions.
Definition: VPlan.h:1579
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2648
void execute(VPTransformState &State) override
Generate the wide store or scatter.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2651
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2595
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2612
static bool tryAddExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.