LLVM 19.0.0git
LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to determine the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
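// As an illustration (a sketch, not part of the original header): assuming a
// vectorization factor (VF) of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// gets a vector body whose induction variable steps by 4 and whose operations
// work on <4 x i32> values, roughly (names like %gep.a are illustrative):
//
//   %wide.load   = load <4 x i32>, ptr %gep.b
//   %add         = add <4 x i32> %wide.load, <i32 42, i32 42, i32 42, i32 42>
//   store <4 x i32> %add, ptr %gep.a
//   %index.next  = add i64 %index, 4
//
// Iterations that do not fill a whole vector are left to a scalar epilogue
// loop or handled by tail folding.
//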
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanTransforms.h"
63#include "VPlanVerifier.h"
64#include "llvm/ADT/APInt.h"
65#include "llvm/ADT/ArrayRef.h"
66#include "llvm/ADT/DenseMap.h"
68#include "llvm/ADT/Hashing.h"
69#include "llvm/ADT/MapVector.h"
70#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
81#include "llvm/Analysis/CFG.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfo.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/MDBuilder.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
122#include "llvm/IR/Type.h"
123#include "llvm/IR/Use.h"
124#include "llvm/IR/User.h"
125#include "llvm/IR/Value.h"
126#include "llvm/IR/ValueHandle.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
145#include <algorithm>
146#include <cassert>
147#include <cmath>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <map>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159
160#define LV_NAME "loop-vectorize"
161#define DEBUG_TYPE LV_NAME
162
163#ifndef NDEBUG
164const char VerboseDebug[] = DEBUG_TYPE "-verbose";
165#endif
166
167/// @{
168/// Metadata attribute names
169const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171 "llvm.loop.vectorize.followup_vectorized";
173 "llvm.loop.vectorize.followup_epilogue";
174/// @}
175
176STATISTIC(LoopsVectorized, "Number of loops vectorized");
177STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
178STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
179
181 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
182 cl::desc("Enable vectorization of epilogue loops."));
183
185 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
186 cl::desc("When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
188 "loops."));
189
191 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
192 cl::desc("Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
194
195/// Loops with a known constant trip count below this number are vectorized only
196/// if no scalar iteration overheads are incurred.
198 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
199 cl::desc("Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
201 "are incurred."));
202
204 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
205 cl::desc("The maximum allowed number of runtime memory checks"));
206
207// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
208// that predication is preferred, and this lists all options. I.e., the
209// vectorizer will try to fold the tail-loop (epilogue) into the vector body
210// and predicate the instructions accordingly. If tail-folding fails, there are
211// different fallback strategies depending on these values:
212namespace PreferPredicateTy {
213 enum Option {
214 ScalarEpilogue = 0,
215 PredicateElseScalarEpilogue,
216 PredicateOrDontVectorize
217 };
218} // namespace PreferPredicateTy
219
221 "prefer-predicate-over-epilogue",
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(TailFoldingStyle::None),
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));
258
260 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
261 cl::desc("Maximize bandwidth when selecting vectorization factor which "
262 "will be determined by the smallest type in loop."));
263
265 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
266 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
267
268/// An interleave-group may need masking if it resides in a block that needs
269/// predication, or in order to mask away gaps.
271 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
272 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
273
275 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
276 cl::desc("A flag that overrides the target's number of scalar registers."));
277
279 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
280 cl::desc("A flag that overrides the target's number of vector registers."));
281
283 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
284 cl::desc("A flag that overrides the target's max interleave factor for "
285 "scalar loops."));
286
288 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
289 cl::desc("A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
291
293 "force-target-instruction-cost", cl::init(0), cl::Hidden,
294 cl::desc("A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
297
299 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
300 cl::desc(
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
303
305 "small-loop-cost", cl::init(20), cl::Hidden,
306 cl::desc(
307 "The cost of a loop that is considered 'small' by the interleaver."));
308
310 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
311 cl::desc("Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
314
315// Runtime interleave loops for load/store throughput.
317 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
318 cl::desc(
319 "Enable runtime interleaving until load/store ports are saturated"));
320
321/// The number of stores in a loop that are allowed to need predication.
323 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
324 cl::desc("Max number of stores to be predicated behind an if."));
325
327 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
328 cl::desc("Count the induction variable only once when interleaving"));
329
331 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
332 cl::desc("Enable if predication of stores during vectorization."));
333
335 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
336 cl::desc("The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
338
339static cl::opt<bool>
340 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
342 cl::desc("Prefer in-loop vector reductions, "
343 "overriding the targets preference."));
344
346 "force-ordered-reductions", cl::init(false), cl::Hidden,
347 cl::desc("Enable the vectorisation of loops with in-order (strict) "
348 "FP reductions"));
349
351 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Prefer predicating a reduction operation over an after loop select."));
354
355namespace llvm {
357 "enable-vplan-native-path", cl::Hidden,
358 cl::desc("Enable VPlan-native vectorization path with "
359 "support for outer loop vectorization."));
360}
361
362// This flag enables the stress testing of the VPlan H-CFG construction in the
363// VPlan-native vectorization path. It must be used in conjunction with
364// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
365// verification of the H-CFGs built.
367 "vplan-build-stress-test", cl::init(false), cl::Hidden,
368 cl::desc(
369 "Build VPlan for every supported loop nest in the function and bail "
370 "out right after the build (stress test the VPlan H-CFG construction "
371 "in the VPlan-native vectorization path)."));
372
374 "interleave-loops", cl::init(true), cl::Hidden,
375 cl::desc("Enable loop interleaving in Loop vectorization passes"));
377 "vectorize-loops", cl::init(true), cl::Hidden,
378 cl::desc("Run the Loop vectorization passes"));
379
381 "vplan-print-in-dot-format", cl::Hidden,
382 cl::desc("Use dot format instead of plain text when dumping VPlans"));
383
385 "force-widen-divrem-via-safe-divisor", cl::Hidden,
386 cl::desc(
387 "Override cost based safe divisor widening for div/rem instructions"));
388
390 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
392 cl::desc("Try wider VFs if they enable the use of vector variants"));
393
394// Likelihood of bypassing the vectorized loop because assumptions about SCEV
395// variables not overflowing do not hold. See `emitSCEVChecks`.
396static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
397// Likelihood of bypassing the vectorized loop because pointers overlap. See
398// `emitMemRuntimeChecks`.
399static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
400// Likelihood of bypassing the vectorized loop because there are zero trips left
401// after prolog. See `emitIterationCountCheck`.
402static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
403
404/// A helper function that returns true if the given type is irregular. The
405/// type is irregular if its allocated size doesn't equal the store size of an
406/// element of the corresponding vector type.
407static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
408 // Determine if an array of N elements of type Ty is "bitcast compatible"
409 // with a <N x Ty> vector.
410 // This is only true if there is no padding between the array elements.
411 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
412}
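// For example (illustrative): i1 has a type size of 1 bit but an alloc size of
// 8 bits, and on x86_64 x86_fp80 has a type size of 80 bits but an alloc size
// of 128 bits, so both are irregular here; i32 and double have matching sizes
// and are not.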
413
414/// A helper function that returns the reciprocal of the block probability of
415/// predicated blocks. If we return X, we are assuming the predicated block
416/// will execute once for every X iterations of the loop header.
417///
418/// TODO: We should use actual block probability here, if available. Currently,
419/// we always assume predicated blocks have a 50% chance of executing.
420static unsigned getReciprocalPredBlockProb() { return 2; }
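// For example (illustrative): with the value 2 returned here, a predicated
// block whose scalarized cost is 8 is charged 8 / 2 = 4 by the cost model,
// reflecting the assumption that the block runs on about half the iterations.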
421
422/// Returns "best known" trip count for the specified loop \p L as defined by
423/// the following procedure:
424/// 1) Returns exact trip count if it is known.
425/// 2) Returns expected trip count according to profile data if any.
426/// 3) Returns upper bound estimate if it is known.
427/// 4) Returns std::nullopt if all of the above failed.
428static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
429 Loop *L) {
430 // Check if exact trip count is known.
431 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
432 return ExpectedTC;
433
434 // Check if there is an expected trip count available from profile data.
436 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
437 return *EstimatedTC;
438
439 // Check if upper bound estimate is known.
440 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
441 return ExpectedTC;
442
443 return std::nullopt;
444}
445
446/// Return a vector containing interleaved elements from multiple
447/// smaller input vectors.
449 const Twine &Name) {
450 unsigned Factor = Vals.size();
451 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
452
453 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
454#ifndef NDEBUG
455 for (Value *Val : Vals)
456 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
457#endif
458
459 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
460 // must use intrinsics to interleave.
461 if (VecTy->isScalableTy()) {
462 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
463 return Builder.CreateIntrinsic(
464 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
465 /*FMFSource=*/nullptr, Name);
466 }
467
468 // Fixed length. Start by concatenating all vectors into a wide vector.
469 Value *WideVec = concatenateVectors(Builder, Vals);
470
471 // Interleave the elements into the wide vector.
472 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
473 return Builder.CreateShuffleVector(
474 WideVec, createInterleaveMask(NumElts, Factor), Name);
475}
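// For example (illustrative, fixed-width case): interleaving two <4 x i32>
// vectors A = <a0, a1, a2, a3> and B = <b0, b1, b2, b3> concatenates them into
// an <8 x i32> value and shuffles it with the mask <0, 4, 1, 5, 2, 6, 3, 7>,
// yielding <a0, b0, a1, b1, a2, b2, a3, b3>. For scalable vectors the same
// result is produced by the llvm.experimental.vector.interleave2 intrinsic.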
476
477namespace {
478// Forward declare GeneratedRTChecks.
479class GeneratedRTChecks;
480
481using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
482} // namespace
483
484namespace llvm {
485
487
488/// InnerLoopVectorizer vectorizes loops which contain only one basic
489/// block to a specified vectorization factor (VF).
490/// This class performs the widening of scalars into vectors, or multiple
491/// scalars. This class also implements the following features:
492/// * It inserts an epilogue loop for handling loops that don't have iteration
493/// counts that are known to be a multiple of the vectorization factor.
494/// * It handles the code generation for reduction variables.
495/// * Scalarization (implementation using scalars) of un-vectorizable
496/// instructions.
497/// InnerLoopVectorizer does not perform any vectorization-legality
498/// checks, and relies on the caller to check for the different legality
499/// aspects. The InnerLoopVectorizer relies on the
500/// LoopVectorizationLegality class to provide information about the induction
501/// and reduction variables that were found, for a given vectorization factor.
503public:
506 const TargetLibraryInfo *TLI,
510 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
512 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
513 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
514 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
515 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
517 // Query this against the original loop and save it here because the profile
518 // of the original loop header may change as the transformation happens.
521
523 this->MinProfitableTripCount = VecWidth;
524 else
525 this->MinProfitableTripCount = MinProfitableTripCount;
526 }
527
528 virtual ~InnerLoopVectorizer() = default;
529
530 /// Create a new empty loop that will contain vectorized instructions later
531 /// on, while the old loop will be used as the scalar remainder. Control flow
532 /// is generated around the vectorized (and scalar epilogue) loops consisting
533 /// of various checks and bypasses. Return the pre-header block of the new
534 /// loop and the start value for the canonical induction, if it is != 0. The
535 /// latter is the case when vectorizing the epilogue loop. In the case of
536/// epilogue vectorization, this function is overridden to handle the more
537 /// complex control flow around the loops. \p ExpandedSCEVs is used to
538 /// look up SCEV expansions for expressions needed during skeleton creation.
539 virtual std::pair<BasicBlock *, Value *>
540 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
541
542 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
543 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
544
545 // Return true if any runtime check is added.
547
548 /// A type for vectorized values in the new loop. Each value from the
549 /// original loop, when vectorized, is represented by UF vector values in the
550 /// new unrolled loop, where UF is the unroll factor.
552
553 /// A helper function to scalarize a single Instruction in the innermost loop.
554 /// Generates a sequence of scalar instances for each lane between \p MinLane
555 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
556 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
557 /// Instr's operands.
558 void scalarizeInstruction(const Instruction *Instr,
559 VPReplicateRecipe *RepRecipe,
560 const VPIteration &Instance,
561 VPTransformState &State);
562
563 /// Try to vectorize interleaved access group \p Group with the base address
564 /// given in \p Addr, optionally masking the vector operations if \p
565 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
566 /// values in the vectorized loop.
568 ArrayRef<VPValue *> VPDefs,
570 ArrayRef<VPValue *> StoredValues,
571 VPValue *BlockInMask, bool NeedsMaskForGaps);
572
573 /// Fix the non-induction PHIs in \p Plan.
574 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
575
576 /// Create a new phi node for the induction variable \p OrigPhi to resume
577 /// iteration count in the scalar epilogue, from where the vectorized loop
578 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
579 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
580 /// and the resume values can come from an additional bypass block, the \p
581 /// AdditionalBypass pair provides information about the bypass block and the
582 /// end value on the edge from bypass to this loop.
584 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
585 ArrayRef<BasicBlock *> BypassBlocks,
586 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
587
588 /// Returns the original loop trip count.
589 Value *getTripCount() const { return TripCount; }
590
591 /// Used to set the trip count after ILV's construction and after the
592 /// preheader block has been executed. Note that this always holds the trip
593 /// count of the original loop for both main loop and epilogue vectorization.
594 void setTripCount(Value *TC) { TripCount = TC; }
595
596protected:
598
599 /// A small list of PHINodes.
601
602 /// A type for scalarized values in the new loop. Each value from the
603 /// original loop, when scalarized, is represented by UF x VF scalar values
604 /// in the new unrolled loop, where UF is the unroll factor and VF is the
605 /// vectorization factor.
607
608 /// Set up the values of the IVs correctly when exiting the vector loop.
609 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
610 Value *VectorTripCount, Value *EndValue,
611 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
612 VPlan &Plan, VPTransformState &State);
613
614 /// Create the exit value of first order recurrences in the middle block and
615 /// update their users.
617 VPTransformState &State);
618
619 /// Create code for the loop exit value of the reduction.
621
622 /// Iteratively sink the scalarized operands of a predicated instruction into
623 /// the block that was created for it.
624 void sinkScalarOperands(Instruction *PredInst);
625
626 /// Returns (and creates if needed) the trip count of the widened loop.
628
629 /// Returns a bitcasted value to the requested vector type.
630 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
632 const DataLayout &DL);
633
634 /// Emit a bypass check to see if the vector trip count is zero, including if
635 /// it overflows.
637
638 /// Emit a bypass check to see if all of the SCEV assumptions we've
639 /// had to make are correct. Returns the block containing the checks or
640 /// nullptr if no checks have been added.
642
643 /// Emit bypass checks to check any memory assumptions we may have made.
644 /// Returns the block containing the checks or nullptr if no checks have been
645 /// added.
647
648 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
649 /// vector loop preheader, middle block and scalar preheader.
651
652 /// Create new phi nodes for the induction variables to resume iteration count
653 /// in the scalar epilogue, from where the vectorized loop left off.
654 /// In cases where the loop skeleton is more complicated (e.g. epilogue
655 /// vectorization) and the resume values can come from an additional bypass
656 /// block, the \p AdditionalBypass pair provides information about the bypass
657 /// block and the end value on the edge from bypass to this loop.
659 const SCEV2ValueTy &ExpandedSCEVs,
660 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
661
662 /// Complete the loop skeleton by adding debug MDs, creating appropriate
663 /// conditional branches in the middle block, preparing the builder and
664 /// running the verifier. Return the preheader of the completed vector loop.
666
667 /// Allow subclasses to override and print debug traces before/after vplan
668 /// execution, when trace information is requested.
669 virtual void printDebugTracesAtStart(){};
670 virtual void printDebugTracesAtEnd(){};
671
672 /// The original loop.
674
675 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
676 /// dynamic knowledge to simplify SCEV expressions and converts them to a
677 /// more usable form.
679
680 /// Loop Info.
682
683 /// Dominator Tree.
685
686 /// Target Library Info.
688
689 /// Target Transform Info.
691
692 /// Assumption Cache.
694
695 /// Interface to emit optimization remarks.
697
698 /// The vectorization SIMD factor to use. Each vector will have this many
699 /// vector elements.
701
703
704 /// The vectorization unroll factor to use. Each scalar is vectorized to this
705 /// many different vector instructions.
706 unsigned UF;
707
708 /// The builder that we use
710
711 // --- Vectorization state ---
712
713 /// The vector-loop preheader.
715
716 /// The scalar-loop preheader.
718
719 /// Middle Block between the vector and the scalar.
721
722 /// The unique ExitBlock of the scalar loop if one exists. Note that
723 /// there can be multiple exiting edges reaching this block.
725
726 /// The scalar loop body.
728
729 /// A list of all bypass blocks. The first block is the entry of the loop.
731
732 /// Store instructions that were predicated.
734
735 /// Trip count of the original loop.
736 Value *TripCount = nullptr;
737
738 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
740
741 /// The legality analysis.
743
744 /// The profitability analysis.
746
747 // Record whether runtime checks are added.
748 bool AddedSafetyChecks = false;
749
750 // Holds the end values for each induction variable. We save the end values
751 // so we can later fix-up the external users of the induction variables.
753
754 /// BFI and PSI are used to check for profile guided size optimizations.
757
758 // Whether this loop should be optimized for size based on profile guided size
759 // optimizations.
761
762 /// Structure to hold information about generated runtime checks, responsible
763 /// for cleaning the checks, if vectorization turns out unprofitable.
764 GeneratedRTChecks &RTChecks;
765
766 // Holds the resume values for reductions in the loops, used to set the
767 // correct start value of reduction PHIs when vectorizing the epilogue.
770};
771
773public:
776 const TargetLibraryInfo *TLI,
778 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
781 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
783 ElementCount::getFixed(1),
784 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
785 BFI, PSI, Check) {}
786};
787
788/// Encapsulate information regarding vectorization of a loop and its epilogue.
789/// This information is meant to be updated and used across two stages of
790/// epilogue vectorization.
793 unsigned MainLoopUF = 0;
795 unsigned EpilogueUF = 0;
800 Value *TripCount = nullptr;
802
804 ElementCount EVF, unsigned EUF)
805 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
806 assert(EUF == 1 &&
807 "A high UF for the epilogue loop is likely not beneficial.");
808 }
809};
810
811/// An extension of the inner loop vectorizer that creates a skeleton for a
812/// vectorized loop that has its epilogue (residual) also vectorized.
813/// The idea is to run the vplan on a given loop twice, first to set up the
814/// skeleton and vectorize the main loop, and second to complete the skeleton
815/// from the first step and vectorize the epilogue. This is achieved by
816/// deriving two concrete strategy classes from this base class and invoking
817/// them in succession from the loop vectorizer planner.
819public:
827 GeneratedRTChecks &Checks)
829 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
830 CM, BFI, PSI, Checks),
831 EPI(EPI) {}
832
833 // Override this function to handle the more complex control flow around the
834 // three loops.
835 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
836 const SCEV2ValueTy &ExpandedSCEVs) final {
837 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
838 }
839
840 /// The interface for creating a vectorized skeleton using one of two
841 /// different strategies, each corresponding to one execution of the vplan
842 /// as described above.
843 virtual std::pair<BasicBlock *, Value *>
844 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
845
846 /// Holds and updates state information required to vectorize the main loop
847 /// and its epilogue in two separate passes. This setup helps us avoid
848 /// regenerating and recomputing runtime safety checks. It also helps us to
849 /// shorten the iteration-count-check path length for the cases where the
850 /// iteration count of the loop is so small that the main vector loop is
851 /// completely skipped.
853};
854
855/// A specialized derived class of inner loop vectorizer that performs
856/// vectorization of *main* loops in the process of vectorizing loops and their
857/// epilogues.
859public:
867 GeneratedRTChecks &Check)
869 EPI, LVL, CM, BFI, PSI, Check) {}
870 /// Implements the interface for creating a vectorized skeleton using the
871 /// *main loop* strategy (i.e. the first pass of vplan execution).
872 std::pair<BasicBlock *, Value *>
873 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
874
875protected:
876 /// Emits an iteration count bypass check once for the main loop (when \p
877 /// ForEpilogue is false) and once for the epilogue loop (when \p
878 /// ForEpilogue is true).
879 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
880 void printDebugTracesAtStart() override;
881 void printDebugTracesAtEnd() override;
882};
883
884// A specialized derived class of inner loop vectorizer that performs
885// vectorization of *epilogue* loops in the process of vectorizing loops and
886// their epilogues.
888public:
896 GeneratedRTChecks &Checks)
898 EPI, LVL, CM, BFI, PSI, Checks) {
900 }
901 /// Implements the interface for creating a vectorized skeleton using the
902 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
903 std::pair<BasicBlock *, Value *>
904 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
905
906protected:
907 /// Emits an iteration count bypass check after the main vector loop has
908 /// finished to see if there are any iterations left to execute by either
909 /// the vector epilogue or the scalar epilogue.
911 BasicBlock *Bypass,
912 BasicBlock *Insert);
913 void printDebugTracesAtStart() override;
914 void printDebugTracesAtEnd() override;
915};
916} // end namespace llvm
917
918/// Look for a meaningful debug location on the instruction or its
919/// operands.
921 if (!I)
922 return DebugLoc();
923
925 if (I->getDebugLoc() != Empty)
926 return I->getDebugLoc();
927
928 for (Use &Op : I->operands()) {
929 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
930 if (OpInst->getDebugLoc() != Empty)
931 return OpInst->getDebugLoc();
932 }
933
934 return I->getDebugLoc();
935}
936
937/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
938/// is passed, the message relates to that particular instruction.
939#ifndef NDEBUG
940static void debugVectorizationMessage(const StringRef Prefix,
941 const StringRef DebugMsg,
942 Instruction *I) {
943 dbgs() << "LV: " << Prefix << DebugMsg;
944 if (I != nullptr)
945 dbgs() << " " << *I;
946 else
947 dbgs() << '.';
948 dbgs() << '\n';
949}
950#endif
951
952/// Create an analysis remark that explains why vectorization failed
953///
954/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
955/// RemarkName is the identifier for the remark. If \p I is passed it is an
956/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
957/// the location of the remark. \return the remark object that can be
958/// streamed to.
960 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
961 Value *CodeRegion = TheLoop->getHeader();
962 DebugLoc DL = TheLoop->getStartLoc();
963
964 if (I) {
965 CodeRegion = I->getParent();
966 // If there is no debug location attached to the instruction, revert back to
967 // using the loop's.
968 if (I->getDebugLoc())
969 DL = I->getDebugLoc();
970 }
971
972 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
973}
974
975namespace llvm {
976
977/// Return a value for Step multiplied by VF.
979 int64_t Step) {
980 assert(Ty->isIntegerTy() && "Expected an integer step");
981 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
982}
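// For example (illustrative): with Ty = i64 and Step = 2, a fixed VF of 4
// yields the constant i64 8, while a scalable VF of <vscale x 4> yields the
// runtime value 8 * vscale (materialized via the llvm.vscale intrinsic).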
983
984/// Return the runtime value for VF.
986 return B.CreateElementCount(Ty, VF);
987}
988
990 Loop *OrigLoop) {
991 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
992 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
993
994 ScalarEvolution &SE = *PSE.getSE();
995 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
996}
997
999 const StringRef OREMsg, const StringRef ORETag,
1000 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1001 Instruction *I) {
1002 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1003 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1004 ORE->emit(
1005 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1006 << "loop not vectorized: " << OREMsg);
1007}
1008
1009void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1010 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1011 Instruction *I) {
1013 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1014 ORE->emit(
1015 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1016 << Msg);
1017}
1018
1019/// Report successful vectorization of the loop. In case an outer loop is
1020/// vectorized, prepend "outer" to the vectorization remark.
1022 VectorizationFactor VF, unsigned IC) {
1024 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1025 nullptr));
1026 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1027 ORE->emit([&]() {
1028 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1029 TheLoop->getHeader())
1030 << "vectorized " << LoopType << "loop (vectorization width: "
1031 << ore::NV("VectorizationFactor", VF.Width)
1032 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1033 });
1034}
1035
1036} // end namespace llvm
1037
1038#ifndef NDEBUG
1039/// \return string containing a file name and a line # for the given loop.
1040static std::string getDebugLocString(const Loop *L) {
1041 std::string Result;
1042 if (L) {
1043 raw_string_ostream OS(Result);
1044 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1045 LoopDbgLoc.print(OS);
1046 else
1047 // Just print the module name.
1048 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1049 OS.flush();
1050 }
1051 return Result;
1052}
1053#endif
1054
1055namespace llvm {
1056
1057// Loop vectorization cost-model hints how the scalar epilogue loop should be
1058// lowered.
1060
1061 // The default: allowing scalar epilogues.
1063
1064 // Vectorization with OptForSize: don't allow epilogues.
1066
1067 // A special case of vectorisation with OptForSize: loops with a very small
1068 // trip count are considered for vectorization under OptForSize, thereby
1069 // making sure the cost of their loop body is dominant, free of runtime
1070 // guards and scalar iteration overheads.
1072
1073 // Loop hint predicate indicating an epilogue is undesired.
1075
1076 // Directive indicating we must either tail fold or not vectorize
1079
1080using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1081
1082/// LoopVectorizationCostModel - estimates the expected speedups due to
1083/// vectorization.
1084/// In many cases vectorization is not profitable. This can happen because of
1085/// a number of reasons. In this class we mainly attempt to predict the
1086/// expected speedup/slowdowns due to the supported instruction set. We use the
1087/// TargetTransformInfo to query the different backends for the cost of
1088/// different operations.
1090public:
1094 const TargetTransformInfo &TTI,
1100 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1101 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1102 Hints(Hints), InterleaveInfo(IAI) {}
1103
1104 /// \return An upper bound for the vectorization factors (both fixed and
1105 /// scalable). If the factors are 0, vectorization and interleaving should be
1106 /// avoided up front.
1107 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1108
1109 /// \return True if runtime checks are required for vectorization, and false
1110 /// otherwise.
1111 bool runtimeChecksRequired();
1112
1113 /// Setup cost-based decisions for user vectorization factor.
1114 /// \return true if the UserVF is a feasible VF to be chosen.
1118 return expectedCost(UserVF).first.isValid();
1119 }
1120
1121 /// \return The size (in bits) of the smallest and widest types in the code
1122 /// that needs to be vectorized. We ignore values that remain scalar such as
1123 /// 64 bit loop indices.
1124 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1125
1126 /// \return The desired interleave count.
1127 /// If interleave count has been specified by metadata it will be returned.
1128 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1129 /// are the selected vectorization factor and the cost of the selected VF.
1130 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1131
1132 /// Memory access instruction may be vectorized in more than one way.
1133 /// Form of instruction after vectorization depends on cost.
1134 /// This function takes cost-based decisions for Load/Store instructions
1135 /// and collects them in a map. This decisions map is used for building
1136 /// the lists of loop-uniform and loop-scalar instructions.
1137 /// The calculated cost is saved with the widening decision in order to
1138 /// avoid redundant calculations.
1140
1141 /// A call may be vectorized in different ways depending on whether we have
1142 /// vectorized variants available and whether the target supports masking.
1143 /// This function analyzes all calls in the function at the supplied VF,
1144 /// makes a decision based on the costs of available options, and stores that
1145 /// decision in a map for use in planning and plan execution.
1147
1148 /// A struct that represents some properties of the register usage
1149 /// of a loop.
1151 /// Holds the number of loop invariant values that are used in the loop.
1152 /// The key is ClassID of target-provided register class.
1154 /// Holds the maximum number of concurrent live intervals in the loop.
1155 /// The key is ClassID of target-provided register class.
1157 };
1158
1159 /// \return Returns information about the register usages of the loop for the
1160 /// given vectorization factors.
1163
1164 /// Collect values we want to ignore in the cost model.
1165 void collectValuesToIgnore();
1166
1167 /// Collect all element types in the loop for which widening is needed.
1169
1170 /// Split reductions into those that happen in the loop, and those that happen
1171 /// outside. In-loop reductions are collected into InLoopReductions.
1173
1174 /// Returns true if we should use strict in-order reductions for the given
1175 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1176 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1177 /// of FP operations.
1178 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1179 return !Hints->allowReordering() && RdxDesc.isOrdered();
1180 }
1181
1182 /// \returns The smallest bitwidth each instruction can be represented with.
1183 /// The vector equivalents of these instructions should be truncated to this
1184 /// type.
1186 return MinBWs;
1187 }
1188
1189 /// \returns True if it is more profitable to scalarize instruction \p I for
1190 /// vectorization factor \p VF.
1192 assert(VF.isVector() &&
1193 "Profitable to scalarize relevant only for VF > 1.");
1194 assert(
1195 TheLoop->isInnermost() &&
1196 "cost-model should not be used for outer loops (in VPlan-native path)");
1197
1198 auto Scalars = InstsToScalarize.find(VF);
1199 assert(Scalars != InstsToScalarize.end() &&
1200 "VF not yet analyzed for scalarization profitability");
1201 return Scalars->second.contains(I);
1202 }
1203
1204 /// Returns true if \p I is known to be uniform after vectorization.
1206 assert(
1207 TheLoop->isInnermost() &&
1208 "cost-model should not be used for outer loops (in VPlan-native path)");
1209 // Pseudo probe needs to be duplicated for each unrolled iteration and
1210 // vector lane so that profiled loop trip count can be accurately
1211 // accumulated instead of being undercounted.
1212 if (isa<PseudoProbeInst>(I))
1213 return false;
1214
1215 if (VF.isScalar())
1216 return true;
1217
1218 auto UniformsPerVF = Uniforms.find(VF);
1219 assert(UniformsPerVF != Uniforms.end() &&
1220 "VF not yet analyzed for uniformity");
1221 return UniformsPerVF->second.count(I);
1222 }
1223
1224 /// Returns true if \p I is known to be scalar after vectorization.
1226 assert(
1227 TheLoop->isInnermost() &&
1228 "cost-model should not be used for outer loops (in VPlan-native path)");
1229 if (VF.isScalar())
1230 return true;
1231
1232 auto ScalarsPerVF = Scalars.find(VF);
1233 assert(ScalarsPerVF != Scalars.end() &&
1234 "Scalar values are not calculated for VF");
1235 return ScalarsPerVF->second.count(I);
1236 }
1237
1238 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1239 /// for vectorization factor \p VF.
1241 return VF.isVector() && MinBWs.contains(I) &&
1242 !isProfitableToScalarize(I, VF) &&
1244 }
1245
1246 /// Decision that was taken during cost calculation for memory instruction.
1249 CM_Widen, // For consecutive accesses with stride +1.
1250 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1257
1258 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1259 /// instruction \p I and vector width \p VF.
1262 assert(VF.isVector() && "Expected VF >=2");
1263 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1264 }
1265
1266 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1267 /// interleaving group \p Grp and vector width \p VF.
1271 assert(VF.isVector() && "Expected VF >=2");
1272 /// Broadcast this decision to all instructions inside the group.
1273 /// But the cost will be assigned to one instruction only.
1274 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1275 if (auto *I = Grp->getMember(i)) {
1276 if (Grp->getInsertPos() == I)
1277 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1278 else
1279 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1280 }
1281 }
1282 }
1283
1284 /// Return the cost model decision for the given instruction \p I and vector
1285 /// width \p VF. Return CM_Unknown if this instruction did not pass
1286 /// through the cost modeling.
1288 assert(VF.isVector() && "Expected VF to be a vector VF");
1289 assert(
1290 TheLoop->isInnermost() &&
1291 "cost-model should not be used for outer loops (in VPlan-native path)");
1292
1293 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1294 auto Itr = WideningDecisions.find(InstOnVF);
1295 if (Itr == WideningDecisions.end())
1296 return CM_Unknown;
1297 return Itr->second.first;
1298 }
1299
1300 /// Return the vectorization cost for the given instruction \p I and vector
1301 /// width \p VF.
1303 assert(VF.isVector() && "Expected VF >=2");
1304 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1305 assert(WideningDecisions.contains(InstOnVF) &&
1306 "The cost is not calculated");
1307 return WideningDecisions[InstOnVF].second;
1308 }
1309
1314 std::optional<unsigned> MaskPos;
1316 };
1317
1319 Function *Variant, Intrinsic::ID IID,
1320 std::optional<unsigned> MaskPos,
1322 assert(!VF.isScalar() && "Expected vector VF");
1323 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1324 MaskPos, Cost};
1325 }
1326
1328 ElementCount VF) const {
1329 assert(!VF.isScalar() && "Expected vector VF");
1330 return CallWideningDecisions.at(std::make_pair(CI, VF));
1331 }
1332
1333 /// Return True if instruction \p I is an optimizable truncate whose operand
1334 /// is an induction variable. Such a truncate will be removed by adding a new
1335 /// induction variable with the destination type.
1337 // If the instruction is not a truncate, return false.
1338 auto *Trunc = dyn_cast<TruncInst>(I);
1339 if (!Trunc)
1340 return false;
1341
1342 // Get the source and destination types of the truncate.
1343 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1344 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1345
1346 // If the truncate is free for the given types, return false. Replacing a
1347 // free truncate with an induction variable would add an induction variable
1348 // update instruction to each iteration of the loop. We exclude from this
1349 // check the primary induction variable since it will need an update
1350 // instruction regardless.
1351 Value *Op = Trunc->getOperand(0);
1352 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1353 return false;
1354
1355 // If the truncated value is not an induction variable, return false.
1356 return Legal->isInductionPhi(Op);
1357 }
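  // For example (illustrative): with a primary i64 induction variable %iv, a
  // non-free use such as
  //
  //   %t = trunc i64 %iv to i32
  //
  // is optimizable when the truncated value is an induction PHI: the truncate
  // can be removed by introducing a new i32 induction variable with the same
  // start and step.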
1358
1359 /// Collects the instructions to scalarize for each predicated instruction in
1360 /// the loop.
1362
1363 /// Collect Uniform and Scalar values for the given \p VF.
1364 /// The sets depend on CM decision for Load/Store instructions
1365 /// that may be vectorized as interleave, gather-scatter or scalarized.
1366 /// Also make a decision on what to do about call instructions in the loop
1367 /// at that VF -- scalarize, call a known vector routine, or call a
1368 /// vector intrinsic.
1370 // Do the analysis once.
1371 if (VF.isScalar() || Uniforms.contains(VF))
1372 return;
1375 collectLoopUniforms(VF);
1376 collectLoopScalars(VF);
1377 }
1378
1379 /// Returns true if the target machine supports masked store operation
1380 /// for the given \p DataType and kind of access to \p Ptr.
1381 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1382 return Legal->isConsecutivePtr(DataType, Ptr) &&
1383 TTI.isLegalMaskedStore(DataType, Alignment);
1384 }
1385
1386 /// Returns true if the target machine supports masked load operation
1387 /// for the given \p DataType and kind of access to \p Ptr.
1388 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1389 return Legal->isConsecutivePtr(DataType, Ptr) &&
1390 TTI.isLegalMaskedLoad(DataType, Alignment);
1391 }
1392
1393 /// Returns true if the target machine can represent \p V as a masked gather
1394 /// or scatter operation.
1396 bool LI = isa<LoadInst>(V);
1397 bool SI = isa<StoreInst>(V);
1398 if (!LI && !SI)
1399 return false;
1400 auto *Ty = getLoadStoreType(V);
1402 if (VF.isVector())
1403 Ty = VectorType::get(Ty, VF);
1404 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1405 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1406 }
1407
1408 /// Returns true if the target machine supports all of the reduction
1409 /// variables found for the given VF.
1411 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1412 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1413 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1414 }));
1415 }
1416
1417 /// Given costs for both strategies, return true if the scalar predication
1418 /// lowering should be used for div/rem. This incorporates an override
1419 /// option so it is not simply a cost comparison.
1421 InstructionCost SafeDivisorCost) const {
1422 switch (ForceSafeDivisor) {
1423 case cl::BOU_UNSET:
1424 return ScalarCost < SafeDivisorCost;
1425 case cl::BOU_TRUE:
1426 return false;
1427 case cl::BOU_FALSE:
1428 return true;
1429 };
1430 llvm_unreachable("impossible case value");
1431 }
1432
1433 /// Returns true if \p I is an instruction which requires predication and
1434 /// for which our chosen predication strategy is scalarization (i.e. we
1435 /// don't have an alternate strategy such as masking available).
1436 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1438
1439 /// Returns true if \p I is an instruction that needs to be predicated
1440 /// at runtime. The result is independent of the predication mechanism.
1441 /// Superset of instructions that return true for isScalarWithPredication.
1442 bool isPredicatedInst(Instruction *I) const;
1443
1444 /// Return the costs for our two available strategies for lowering a
1445 /// div/rem operation which requires speculating at least one lane.
1446 /// First result is for scalarization (will be invalid for scalable
1447 /// vectors); second is for the safe-divisor strategy.
1448 std::pair<InstructionCost, InstructionCost>
1450 ElementCount VF) const;
1451
1452 /// Returns true if \p I is a memory instruction with consecutive memory
1453 /// access that can be widened.
1455
1456 /// Returns true if \p I is a memory instruction in an interleaved-group
1457 /// of memory accesses that can be vectorized with wide vector loads/stores
1458 /// and shuffles.
1460
1461 /// Check if \p Instr belongs to any interleaved access group.
1463 return InterleaveInfo.isInterleaved(Instr);
1464 }
1465
1466 /// Get the interleaved access group that \p Instr belongs to.
1469 return InterleaveInfo.getInterleaveGroup(Instr);
1470 }
1471
1472 /// Returns true if we're required to use a scalar epilogue for at least
1473 /// the final iteration of the original loop.
1474 bool requiresScalarEpilogue(bool IsVectorizing) const {
1476 return false;
1477 // If we might exit from anywhere but the latch, must run the exiting
1478 // iteration in scalar form.
1480 return true;
1481 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1482 }
1483
1484 /// Returns true if we're required to use a scalar epilogue for at least
1485 /// the final iteration of the original loop for all VFs in \p Range.
1486 /// A scalar epilogue must either be required for all VFs in \p Range or for
1487 /// none.
1489 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1490 return requiresScalarEpilogue(VF.isVector());
1491 };
1492 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1493 assert(
1494 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1495 "all VFs in range must agree on whether a scalar epilogue is required");
1496 return IsRequired;
1497 }
1498
1499 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1500 /// disallowed due to optsize or a loop hint annotation.
1502 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1503 }
1504
1505 /// Returns the TailFoldingStyle that is best for the current loop.
1506 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1507 if (!ChosenTailFoldingStyle)
1509 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1510 : ChosenTailFoldingStyle->second;
1511 }
1512
1513 /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1514 /// update may overflow or not.
1515 /// \param IsScalableVF true if scalable vector factors enabled.
1516 /// \param UserIC User specific interleave count.
1517 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1518 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1520 ChosenTailFoldingStyle =
1522 return;
1523 }
1524
1525 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1526 ChosenTailFoldingStyle = std::make_pair(
1527 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1528 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1529 return;
1530 }
1531
1532 // Set styles when forced.
1533 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1534 ForceTailFoldingStyle.getValue());
1536 return;
1537 // Override forced styles if needed.
1538 // FIXME: use actual opcode/data type for analysis here.
1539 // FIXME: Investigate opportunity for fixed vector factor.
1540 bool EVLIsLegal =
1541 IsScalableVF && UserIC <= 1 &&
1542 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1544 // FIXME: implement support for max safe dependency distance.
1546 // FIXME: remove this once reductions are supported.
1548 if (!EVLIsLegal) {
1549 // If for some reason EVL mode is unsupported, fall back to
1550 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1551 // in a generic way.
1552 ChosenTailFoldingStyle =
1555 LLVM_DEBUG(
1556 dbgs()
1557 << "LV: Preference for VP intrinsics indicated. Will "
1558 "not try to generate VP Intrinsics "
1559 << (UserIC > 1
1560 ? "since interleave count specified is greater than 1.\n"
1561 : "due to non-interleaving reasons.\n"));
1562 }
1563 }
1564
1565 /// Returns true if all loop blocks should be masked to fold tail loop.
1566 bool foldTailByMasking() const {
1567 // TODO: check if it is possible to check for None style independent of
1568 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1570 }
1571
1572 /// Returns true if the instructions in this block require predication
1573 /// for any reason, e.g. because tail folding now requires a predicate
1574 /// or because the block in the original loop was predicated.
1577 }
1578
1579 /// Returns true if VP intrinsics with explicit vector length support should
1580 /// be generated in the tail folded loop.
1581 bool foldTailWithEVL() const {
1583 // FIXME: remove this once vp_reverse is supported.
1584 none_of(
1585 WideningDecisions,
1586 [](const std::pair<std::pair<Instruction *, ElementCount>,
1587 std::pair<InstWidening, InstructionCost>>
1588 &Data) { return Data.second.first == CM_Widen_Reverse; });
1589 }
1590
1591 /// Returns true if the Phi is part of an inloop reduction.
1592 bool isInLoopReduction(PHINode *Phi) const {
1593 return InLoopReductions.contains(Phi);
1594 }
1595
1596 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1597 /// with factor VF. Return the cost of the instruction, including
1598 /// scalarization overhead if it's needed.
1600
1601 /// Estimate cost of a call instruction CI if it were vectorized with factor
1602 /// VF. Return the cost of the instruction, including scalarization overhead
1603 /// if it's needed.
1605
1606 /// Invalidates decisions already taken by the cost model.
1608 WideningDecisions.clear();
1609 CallWideningDecisions.clear();
1610 Uniforms.clear();
1611 Scalars.clear();
1612 }
1613
1614 /// The vectorization cost is a combination of the cost itself and a boolean
1615 /// indicating whether any of the contributing operations will actually
1616 /// operate on vector values after type legalization in the backend. If this
1617 /// latter value is false, then all operations will be scalarized (i.e. no
1618 /// vectorization has actually taken place).
1619 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1620
1621 /// Returns the expected execution cost. The unit of the cost does
1622 /// not matter because we use the 'cost' units to compare different
1623 /// vector widths. The cost that is returned is *not* normalized by
1624 /// the factor width. If \p Invalid is not nullptr, this function
1625 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1626 /// each instruction that has an Invalid cost for the given VF.
1630
1631 bool hasPredStores() const { return NumPredStores > 0; }
1632
1633 /// Returns true if epilogue vectorization is considered profitable, and
1634 /// false otherwise.
1635 /// \p VF is the vectorization factor chosen for the original loop.
1637
1638private:
1639 unsigned NumPredStores = 0;
1640
1641 /// \return An upper bound for the vectorization factors for both
1642 /// fixed and scalable vectorization, where the minimum-known number of
1643 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1644 /// disabled or unsupported, then the scalable part will be equal to
1645 /// ElementCount::getScalable(0).
1646 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1647 ElementCount UserVF,
1648 bool FoldTailByMasking);
1649
1650 /// \return the maximized element count based on the target's vector
1651 /// registers and the loop trip-count, but limited to a maximum safe VF.
1652 /// This is a helper function of computeFeasibleMaxVF.
1653 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1654 unsigned SmallestType,
1655 unsigned WidestType,
1656 ElementCount MaxSafeVF,
1657 bool FoldTailByMasking);
1658
1659 /// \return the maximum legal scalable VF, based on the safe max number
1660 /// of elements.
1661 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1662
1663 /// Returns the execution time cost of an instruction for a given vector
1664 /// width. Vector width of one means scalar.
1665 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1666
1667 /// The cost-computation logic from getInstructionCost which provides
1668 /// the vector type as an output parameter.
1669 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1670 Type *&VectorTy);
1671
1672 /// Return the cost of instructions in an inloop reduction pattern, if I is
1673 /// part of that pattern.
1674 std::optional<InstructionCost>
1675 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1677
1678 /// Calculate vectorization cost of memory instruction \p I.
1679 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1680
1681 /// The cost computation for scalarized memory instruction.
1682 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1683
1684 /// The cost computation for interleaving group of memory instructions.
1685 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1686
1687 /// The cost computation for Gather/Scatter instruction.
1688 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1689
1690 /// The cost computation for widening instruction \p I with consecutive
1691 /// memory access.
1692 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1693
1694 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1695 /// Load: scalar load + broadcast.
1696 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1697 /// element)
1698 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1699
1700 /// Estimate the overhead of scalarizing an instruction. This is a
1701 /// convenience wrapper for the type-based getScalarizationOverhead API.
1702 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1704
1705 /// Returns true if an artificially high cost for emulated masked memrefs
1706 /// should be used.
1707 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1708
1709 /// Map of scalar integer values to the smallest bitwidth they can be legally
1710 /// represented as. The vector equivalents of these values should be truncated
1711 /// to this type.
1713
1714 /// A type representing the costs for instructions if they were to be
1715 /// scalarized rather than vectorized. The entries are Instruction-Cost
1716 /// pairs.
1717 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1718
1719 /// A set containing all BasicBlocks that are known to be present after
1720 /// vectorization as predicated blocks.
1722 PredicatedBBsAfterVectorization;
1723
1724 /// Records whether it is allowed to have the original scalar loop execute at
1725 /// least once. This may be needed as a fallback loop in case runtime
1726 /// aliasing/dependence checks fail, or to handle the tail/remainder
1727 /// iterations when the trip count is unknown or isn't divisible by the VF,
1728 /// or as a peel-loop to handle gaps in interleave-groups.
1729 /// Under optsize and when the trip count is very small we don't allow any
1730 /// iterations to execute in the scalar loop.
1731 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1732
1733 /// Controls the finally chosen tail folding style. The first element is used
1734 /// if the IV update may overflow, the second if it does not.
1735 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1736 ChosenTailFoldingStyle;
1737
1738 /// A map holding scalar costs for different vectorization factors. The
1739 /// presence of a cost for an instruction in the mapping indicates that the
1740 /// instruction will be scalarized when vectorizing with the associated
1741 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1743
1744 /// Holds the instructions known to be uniform after vectorization.
1745 /// The data is collected per VF.
1747
1748 /// Holds the instructions known to be scalar after vectorization.
1749 /// The data is collected per VF.
1751
1752 /// Holds the instructions (address computations) that are forced to be
1753 /// scalarized.
1755
1756 /// PHINodes of the reductions that should be expanded in-loop.
1757 SmallPtrSet<PHINode *, 4> InLoopReductions;
1758
1759 /// A Map of inloop reduction operations and their immediate chain operand.
1760 /// FIXME: This can be removed once reductions can be costed correctly in
1761 /// VPlan. This was added to allow quick lookup of the inloop operations.
1762 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1763
1764 /// Returns the expected difference in cost from scalarizing the expression
1765 /// feeding a predicated instruction \p PredInst. The instructions to
1766 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1767 /// non-negative return value implies the expression will be scalarized.
1768 /// Currently, only single-use chains are considered for scalarization.
1769 InstructionCost computePredInstDiscount(Instruction *PredInst,
1770 ScalarCostsTy &ScalarCosts,
1771 ElementCount VF);
1772
1773 /// Collect the instructions that are uniform after vectorization. An
1774 /// instruction is uniform if we represent it with a single scalar value in
1775 /// the vectorized loop corresponding to each vector iteration. Examples of
1776 /// uniform instructions include pointer operands of consecutive or
1777 /// interleaved memory accesses. Note that although uniformity implies an
1778 /// instruction will be scalar, the reverse is not true. In general, a
1779 /// scalarized instruction will be represented by VF scalar values in the
1780 /// vectorized loop, each corresponding to an iteration of the original
1781 /// scalar loop.
1782 void collectLoopUniforms(ElementCount VF);
1783
1784 /// Collect the instructions that are scalar after vectorization. An
1785 /// instruction is scalar if it is known to be uniform or will be scalarized
1786 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1787 /// to the list if they are used by a load/store instruction that is marked as
1788 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1789 /// VF values in the vectorized loop, each corresponding to an iteration of
1790 /// the original scalar loop.
1791 void collectLoopScalars(ElementCount VF);
1792
1793 /// Keeps cost model vectorization decision and cost for instructions.
1794 /// Right now it is used for memory instructions only.
1796 std::pair<InstWidening, InstructionCost>>;
1797
1798 DecisionList WideningDecisions;
1799
1800 using CallDecisionList =
1801 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1802
1803 CallDecisionList CallWideningDecisions;
1804
1805 /// Returns true if \p V is expected to be vectorized and it needs to be
1806 /// extracted.
1807 bool needsExtract(Value *V, ElementCount VF) const {
1808 Instruction *I = dyn_cast<Instruction>(V);
1809 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1811 return false;
1812
1813 // Assume we can vectorize V (and hence we need extraction) if the
1814 // scalars are not computed yet. This can happen, because it is called
1815 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1816 // the scalars are collected. That should be a safe assumption in most
1817 // cases, because we check if the operands have vectorizable types
1818 // beforehand in LoopVectorizationLegality.
1819 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1820 };
1821
1822 /// Returns a range containing only operands needing to be extracted.
1823 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1824 ElementCount VF) const {
1826 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1827 }
1828
1829public:
1830 /// The loop that we evaluate.
1832
1833 /// Predicated scalar evolution analysis.
1835
1836 /// Loop Info analysis.
1838
1839 /// Vectorization legality.
1841
1842 /// Vector target information.
1844
1845 /// Target Library Info.
1847
1848 /// Demanded bits analysis.
1850
1851 /// Assumption cache.
1853
1854 /// Interface to emit optimization remarks.
1856
1858
1859 /// Loop Vectorize Hint.
1861
1862 /// The interleave access information contains groups of interleaved accesses
1863 /// with the same stride and close to each other.
1865
1866 /// Values to ignore in the cost model.
1868
1869 /// Values to ignore in the cost model when VF > 1.
1871
1872 /// All element types found in the loop.
1874};
1875} // end namespace llvm
1876
1877namespace {
1878/// Helper struct to manage generating runtime checks for vectorization.
1879///
1880 /// The runtime checks are created up-front in temporary blocks, un-linked from
1881 /// the existing IR, to allow better cost estimation. After deciding to
1882 /// vectorize, the checks are moved back into the IR. If we decide not to
1883 /// vectorize, the temporary blocks are removed completely.
1884class GeneratedRTChecks {
1885 /// Basic block which contains the generated SCEV checks, if any.
1886 BasicBlock *SCEVCheckBlock = nullptr;
1887
1888 /// The value representing the result of the generated SCEV checks. If it is
1889 /// nullptr, either no SCEV checks have been generated or they have been used.
1890 Value *SCEVCheckCond = nullptr;
1891
1892 /// Basic block which contains the generated memory runtime checks, if any.
1893 BasicBlock *MemCheckBlock = nullptr;
1894
1895 /// The value representing the result of the generated memory runtime checks.
1896 /// If it is nullptr, either no memory runtime checks have been generated or
1897 /// they have been used.
1898 Value *MemRuntimeCheckCond = nullptr;
1899
1900 DominatorTree *DT;
1901 LoopInfo *LI;
1903
1904 SCEVExpander SCEVExp;
1905 SCEVExpander MemCheckExp;
1906
1907 bool CostTooHigh = false;
1908 const bool AddBranchWeights;
1909
1910 Loop *OuterLoop = nullptr;
1911
1912public:
1913 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1915 bool AddBranchWeights)
1916 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1917 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1918
1919 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1920 /// accurately estimate the cost of the runtime checks. The blocks are
1921 /// un-linked from the IR and added back during vector code generation. If
1922 /// there is no vector code generation, the check blocks are removed
1923 /// completely.
1924 void Create(Loop *L, const LoopAccessInfo &LAI,
1925 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1926
1927 // Hard cutoff to limit compile-time increase in case a very large number of
1928 // runtime checks needs to be generated.
1929 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1930 // profile info.
1931 CostTooHigh =
1933 if (CostTooHigh)
1934 return;
1935
1936 BasicBlock *LoopHeader = L->getHeader();
1937 BasicBlock *Preheader = L->getLoopPreheader();
1938
1939 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1940 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1941 // may be used by SCEVExpander. The blocks will be un-linked from their
1942 // predecessors and removed from LI & DT at the end of the function.
1943 if (!UnionPred.isAlwaysTrue()) {
1944 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1945 nullptr, "vector.scevcheck");
1946
1947 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1948 &UnionPred, SCEVCheckBlock->getTerminator());
1949 }
1950
1951 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1952 if (RtPtrChecking.Need) {
1953 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1954 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1955 "vector.memcheck");
1956
1957 auto DiffChecks = RtPtrChecking.getDiffChecks();
1958 if (DiffChecks) {
1959 Value *RuntimeVF = nullptr;
1960 MemRuntimeCheckCond = addDiffRuntimeChecks(
1961 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1962 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1963 if (!RuntimeVF)
1964 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1965 return RuntimeVF;
1966 },
1967 IC);
1968 } else {
1969 MemRuntimeCheckCond = addRuntimeChecks(
1970 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1972 }
1973 assert(MemRuntimeCheckCond &&
1974 "no RT checks generated although RtPtrChecking "
1975 "claimed checks are required");
1976 }
1977
1978 if (!MemCheckBlock && !SCEVCheckBlock)
1979 return;
1980
1981 // Unhook the temporary block with the checks, update various places
1982 // accordingly.
1983 if (SCEVCheckBlock)
1984 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1985 if (MemCheckBlock)
1986 MemCheckBlock->replaceAllUsesWith(Preheader);
1987
1988 if (SCEVCheckBlock) {
1989 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1990 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1991 Preheader->getTerminator()->eraseFromParent();
1992 }
1993 if (MemCheckBlock) {
1994 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1995 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1996 Preheader->getTerminator()->eraseFromParent();
1997 }
1998
1999 DT->changeImmediateDominator(LoopHeader, Preheader);
2000 if (MemCheckBlock) {
2001 DT->eraseNode(MemCheckBlock);
2002 LI->removeBlock(MemCheckBlock);
2003 }
2004 if (SCEVCheckBlock) {
2005 DT->eraseNode(SCEVCheckBlock);
2006 LI->removeBlock(SCEVCheckBlock);
2007 }
2008
2009 // Outer loop is used as part of the later cost calculations.
2010 OuterLoop = L->getParentLoop();
2011 }
2012
2013 InstructionCost getCost() {
2014 if (SCEVCheckBlock || MemCheckBlock)
2015 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2016
2017 if (CostTooHigh) {
2019 Cost.setInvalid();
2020 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2021 return Cost;
2022 }
2023
2024 InstructionCost RTCheckCost = 0;
2025 if (SCEVCheckBlock)
2026 for (Instruction &I : *SCEVCheckBlock) {
2027 if (SCEVCheckBlock->getTerminator() == &I)
2028 continue;
2031 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2032 RTCheckCost += C;
2033 }
2034 if (MemCheckBlock) {
2035 InstructionCost MemCheckCost = 0;
2036 for (Instruction &I : *MemCheckBlock) {
2037 if (MemCheckBlock->getTerminator() == &I)
2038 continue;
2041 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2042 MemCheckCost += C;
2043 }
2044
2045 // If the runtime memory checks are being created inside an outer loop
2046 // we should find out if these checks are outer loop invariant. If so,
2047 // the checks will likely be hoisted out and so the effective cost will be
2048 // reduced in proportion to the outer loop trip count.
2049 if (OuterLoop) {
2050 ScalarEvolution *SE = MemCheckExp.getSE();
2051 // TODO: If profitable, we could refine this further by analysing every
2052 // individual memory check, since there could be a mixture of loop
2053 // variant and invariant checks that mean the final condition is
2054 // variant.
2055 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2056 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2057 // It seems reasonable to assume that we can reduce the effective
2058 // cost of the checks even when we know nothing about the trip
2059 // count. Assume that the outer loop executes at least twice.
2060 unsigned BestTripCount = 2;
2061
2062 // If exact trip count is known use that.
2063 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2064 BestTripCount = SmallTC;
2066 // Else use profile data if available.
2067 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2068 BestTripCount = *EstimatedTC;
2069 }
2070
2071 BestTripCount = std::max(BestTripCount, 1U);
2072 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2073
2074 // Let's ensure the cost is always at least 1.
2075 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2077
2078 if (BestTripCount > 1)
2080 << "We expect runtime memory checks to be hoisted "
2081 << "out of the outer loop. Cost reduced from "
2082 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2083
2084 MemCheckCost = NewMemCheckCost;
2085 }
2086 }
2087
2088 RTCheckCost += MemCheckCost;
2089 }
2090
2091 if (SCEVCheckBlock || MemCheckBlock)
2092 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2093 << "\n");
2094
2095 return RTCheckCost;
2096 }
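// Editorial illustration (not part of the original source), with hypothetical
// numbers: if the memory-check block costs 30 and the enclosing outer loop is
// expected to run 10 times (from an exact or profile-based trip count), the
// division above accounts the checks at an effective cost of 30 / 10 = 3.
// A cheaper block, e.g. cost 5, would divide down to 0 and is clamped back up
// to 1 so the checks are never treated as free.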
2097
2098 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2099 /// unused.
2100 ~GeneratedRTChecks() {
2101 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2102 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2103 if (!SCEVCheckCond)
2104 SCEVCleaner.markResultUsed();
2105
2106 if (!MemRuntimeCheckCond)
2107 MemCheckCleaner.markResultUsed();
2108
2109 if (MemRuntimeCheckCond) {
2110 auto &SE = *MemCheckExp.getSE();
2111 // Memory runtime check generation creates compares that use expanded
2112 // values. Remove them before running the SCEVExpanderCleaners.
2113 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2114 if (MemCheckExp.isInsertedInstruction(&I))
2115 continue;
2116 SE.forgetValue(&I);
2117 I.eraseFromParent();
2118 }
2119 }
2120 MemCheckCleaner.cleanup();
2121 SCEVCleaner.cleanup();
2122
2123 if (SCEVCheckCond)
2124 SCEVCheckBlock->eraseFromParent();
2125 if (MemRuntimeCheckCond)
2126 MemCheckBlock->eraseFromParent();
2127 }
2128
2129 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2130 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2131 /// depending on the generated condition.
2132 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2133 BasicBlock *LoopVectorPreHeader,
2134 BasicBlock *LoopExitBlock) {
2135 if (!SCEVCheckCond)
2136 return nullptr;
2137
2138 Value *Cond = SCEVCheckCond;
2139 // Mark the check as used, to prevent it from being removed during cleanup.
2140 SCEVCheckCond = nullptr;
2141 if (auto *C = dyn_cast<ConstantInt>(Cond))
2142 if (C->isZero())
2143 return nullptr;
2144
2145 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2146
2147 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2148 // Create new preheader for vector loop.
2149 if (OuterLoop)
2150 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2151
2152 SCEVCheckBlock->getTerminator()->eraseFromParent();
2153 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2154 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2155 SCEVCheckBlock);
2156
2157 DT->addNewBlock(SCEVCheckBlock, Pred);
2158 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2159
2160 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2161 if (AddBranchWeights)
2163 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2164 return SCEVCheckBlock;
2165 }
2166
2167 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2168 /// the branches to branch to the vector preheader or \p Bypass, depending on
2169 /// the generated condition.
2170 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2171 BasicBlock *LoopVectorPreHeader) {
2172 // Check if we generated code that checks at runtime whether arrays overlap.
2173 if (!MemRuntimeCheckCond)
2174 return nullptr;
2175
2176 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2177 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2178 MemCheckBlock);
2179
2180 DT->addNewBlock(MemCheckBlock, Pred);
2181 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2182 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2183
2184 if (OuterLoop)
2185 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2186
2187 BranchInst &BI =
2188 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2189 if (AddBranchWeights) {
2191 }
2192 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2193 MemCheckBlock->getTerminator()->setDebugLoc(
2194 Pred->getTerminator()->getDebugLoc());
2195
2196 // Mark the check as used, to prevent it from being removed during cleanup.
2197 MemRuntimeCheckCond = nullptr;
2198 return MemCheckBlock;
2199 }
2200};
2201} // namespace
2202
2204 return Style == TailFoldingStyle::Data ||
2205 Style == TailFoldingStyle::DataAndControlFlow ||
2206 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2207}
2208
2210 return Style == TailFoldingStyle::DataAndControlFlow ||
2211 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2212}
2213
2214// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2215// vectorization. The loop needs to be annotated with #pragma omp simd
2216 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2217// vector length information is not provided, vectorization is not considered
2218// explicit. Interleave hints are not allowed either. These limitations will be
2219// relaxed in the future.
2220 // Please note that we are currently forced to abuse the pragma 'clang
2221// vectorize' semantics. This pragma provides *auto-vectorization hints*
2222// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2223// provides *explicit vectorization hints* (LV can bypass legal checks and
2224// assume that vectorization is legal). However, both hints are implemented
2225// using the same metadata (llvm.loop.vectorize, processed by
2226// LoopVectorizeHints). This will be fixed in the future when the native IR
2227// representation for pragma 'omp simd' is introduced.
2228static bool isExplicitVecOuterLoop(Loop *OuterLp,
2230 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2231 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2232
2233 // Only outer loops with an explicit vectorization hint are supported.
2234 // Unannotated outer loops are ignored.
2236 return false;
2237
2238 Function *Fn = OuterLp->getHeader()->getParent();
2239 if (!Hints.allowVectorization(Fn, OuterLp,
2240 true /*VectorizeOnlyWhenForced*/)) {
2241 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2242 return false;
2243 }
2244
2245 if (Hints.getInterleave() > 1) {
2246 // TODO: Interleave support is future work.
2247 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2248 "outer loops.\n");
2249 Hints.emitRemarkWithHints();
2250 return false;
2251 }
2252
2253 return true;
2254}
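// Editorial example (not part of the original source): a loop nest that the
// check above would accept could be annotated in C/C++ roughly as follows
// (names and bounds are hypothetical); the essential parts are the explicit
// vector-length hint on the outer loop and the absence of an interleave hint:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)      // annotated outer loop
//     for (int j = 0; j < M; ++j)    // inner loop
//       A[i][j] = B[i][j] + C[i];
//
// With OpenMP enabled, '#pragma omp simd simdlen(4)' on the outer loop maps to
// the same llvm.loop.vectorize metadata, as described above.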
2255
2259 // Collect inner loops and outer loops without irreducible control flow. For
2260 // now, only collect outer loops that have explicit vectorization hints. If we
2261 // are stress testing the VPlan H-CFG construction, we collect the outermost
2262 // loop of every loop nest.
2263 if (L.isInnermost() || VPlanBuildStressTest ||
2265 LoopBlocksRPO RPOT(&L);
2266 RPOT.perform(LI);
2267 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2268 V.push_back(&L);
2269 // TODO: Collect inner loops inside marked outer loops in case
2270 // vectorization fails for the outer loop. Do not invoke
2271 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2272 // already known to be reducible. We can use an inherited attribute for
2273 // that.
2274 return;
2275 }
2276 }
2277 for (Loop *InnerL : L)
2278 collectSupportedLoops(*InnerL, LI, ORE, V);
2279}
2280
2281//===----------------------------------------------------------------------===//
2282// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2283// LoopVectorizationCostModel and LoopVectorizationPlanner.
2284//===----------------------------------------------------------------------===//
2285
2286/// Compute the transformed value of Index at offset StartValue using step
2287/// StepValue.
2288/// For integer induction, returns StartValue + Index * StepValue.
2289/// For pointer induction, returns StartValue[Index * StepValue].
2290/// FIXME: The newly created binary instructions should contain nsw/nuw
2291/// flags, which can be found from the original scalar operations.
2292static Value *
2294 Value *Step,
2296 const BinaryOperator *InductionBinOp) {
2297 Type *StepTy = Step->getType();
2298 Value *CastedIndex = StepTy->isIntegerTy()
2299 ? B.CreateSExtOrTrunc(Index, StepTy)
2300 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2301 if (CastedIndex != Index) {
2302 CastedIndex->setName(CastedIndex->getName() + ".cast");
2303 Index = CastedIndex;
2304 }
2305
2306 // Note: the IR at this point is broken. We cannot use SE to create any new
2307 // SCEV and then expand it, hoping that SCEV's simplification will give us
2308 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2309 // lead to various SCEV crashes. So all we can do is use the builder and rely
2310 // on InstCombine for future simplifications. Here we handle some trivial
2311 // cases only.
2312 auto CreateAdd = [&B](Value *X, Value *Y) {
2313 assert(X->getType() == Y->getType() && "Types don't match!");
2314 if (auto *CX = dyn_cast<ConstantInt>(X))
2315 if (CX->isZero())
2316 return Y;
2317 if (auto *CY = dyn_cast<ConstantInt>(Y))
2318 if (CY->isZero())
2319 return X;
2320 return B.CreateAdd(X, Y);
2321 };
2322
2323 // We allow X to be a vector type, in which case Y will potentially be
2324 // splatted into a vector with the same element count.
2325 auto CreateMul = [&B](Value *X, Value *Y) {
2326 assert(X->getType()->getScalarType() == Y->getType() &&
2327 "Types don't match!");
2328 if (auto *CX = dyn_cast<ConstantInt>(X))
2329 if (CX->isOne())
2330 return Y;
2331 if (auto *CY = dyn_cast<ConstantInt>(Y))
2332 if (CY->isOne())
2333 return X;
2334 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2335 if (XVTy && !isa<VectorType>(Y->getType()))
2336 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2337 return B.CreateMul(X, Y);
2338 };
2339
2340 switch (InductionKind) {
2342 assert(!isa<VectorType>(Index->getType()) &&
2343 "Vector indices not supported for integer inductions yet");
2344 assert(Index->getType() == StartValue->getType() &&
2345 "Index type does not match StartValue type");
2346 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2347 return B.CreateSub(StartValue, Index);
2348 auto *Offset = CreateMul(Index, Step);
2349 return CreateAdd(StartValue, Offset);
2350 }
2352 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2354 assert(!isa<VectorType>(Index->getType()) &&
2355 "Vector indices not supported for FP inductions yet");
2356 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2357 assert(InductionBinOp &&
2358 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2359 InductionBinOp->getOpcode() == Instruction::FSub) &&
2360 "Original bin op should be defined for FP induction");
2361
2362 Value *MulExp = B.CreateFMul(Step, Index);
2363 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2364 "induction");
2365 }
2367 return nullptr;
2368 }
2369 llvm_unreachable("invalid enum");
2370}
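// Editorial illustration (not part of the original source), using hypothetical
// values: for an integer induction with StartValue = 10 and Step = 3, an Index
// of 4 is transformed into 10 + 4 * 3 = 22; the CreateAdd/CreateMul helpers
// above simply fold trivial 0/1 operands away. For a pointer induction the
// same index yields the address StartValue[4 * 3], as the function comment
// states, and the FP case multiplies Step and Index and then applies the
// original fadd/fsub to StartValue.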
2371
2372std::optional<unsigned> getMaxVScale(const Function &F,
2373 const TargetTransformInfo &TTI) {
2374 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2375 return MaxVScale;
2376
2377 if (F.hasFnAttribute(Attribute::VScaleRange))
2378 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2379
2380 return std::nullopt;
2381}
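// Editorial note (not part of the original source): if the target reports no
// maximum vscale, the function falls back to the IR attribute; e.g. a function
// carrying 'vscale_range(1,16)' yields a maximum vscale of 16, while a
// function with neither source of information returns std::nullopt and callers
// must stay conservative.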
2382
2383/// For the given VF and UF and maximum trip count computed for the loop, return
2384/// whether the induction variable might overflow in the vectorized loop. If not,
2385/// then we know a runtime overflow check always evaluates to false and can be
2386/// removed.
2389 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2390 // Always be conservative if we don't know the exact unroll factor.
2391 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2392
2393 Type *IdxTy = Cost->Legal->getWidestInductionType();
2394 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2395
2396 // The runtime overflow check is known to be false iff the (max) trip-count
2397 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2398 // the vector loop induction variable.
2399 if (unsigned TC =
2400 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2401 uint64_t MaxVF = VF.getKnownMinValue();
2402 if (VF.isScalable()) {
2403 std::optional<unsigned> MaxVScale =
2404 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2405 if (!MaxVScale)
2406 return false;
2407 MaxVF *= *MaxVScale;
2408 }
2409
2410 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2411 }
2412
2413 return false;
2414}
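// Editorial illustration (not part of the original source), using hypothetical
// numbers: with a 32-bit widest induction type, a known maximum trip count of
// 1000, a fixed VF of 4 and a maximum interleave factor of 2, the test above
// becomes (2^32 - 1 - 1000) >u 4 * 2, which holds, so the runtime overflow
// check is known to be false and can be removed. For a scalable VF the known
// minimum is additionally multiplied by the maximum vscale, and the check is
// conservatively kept (false is returned) when that maximum is unknown.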
2415
2416// Return whether we allow using masked interleave-groups (for dealing with
2417// strided loads/stores that reside in predicated blocks, or for dealing
2418// with gaps).
2420 // If an override option has been passed in for interleaved accesses, use it.
2421 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2423
2425}
2426
2427// Try to vectorize the interleave group that \p Instr belongs to.
2428//
2429// E.g. Translate following interleaved load group (factor = 3):
2430// for (i = 0; i < N; i+=3) {
2431// R = Pic[i]; // Member of index 0
2432// G = Pic[i+1]; // Member of index 1
2433// B = Pic[i+2]; // Member of index 2
2434// ... // do something to R, G, B
2435// }
2436// To:
2437// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2438// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2439// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2440// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2441//
2442// Or translate following interleaved store group (factor = 3):
2443// for (i = 0; i < N; i+=3) {
2444// ... do something to R, G, B
2445// Pic[i] = R; // Member of index 0
2446// Pic[i+1] = G; // Member of index 1
2447// Pic[i+2] = B; // Member of index 2
2448// }
2449// To:
2450// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2451// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2452// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2453// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2454// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2457 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2458 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2459 Instruction *Instr = Group->getInsertPos();
2460 const DataLayout &DL = Instr->getModule()->getDataLayout();
2461
2462 // Prepare for the vector type of the interleaved load/store.
2463 Type *ScalarTy = getLoadStoreType(Instr);
2464 unsigned InterleaveFactor = Group->getFactor();
2465 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2466
2467 // Prepare for the new pointers.
2468 SmallVector<Value *, 2> AddrParts;
2469 unsigned Index = Group->getIndex(Instr);
2470
2471 // TODO: extend the masked interleaved-group support to reversed access.
2472 assert((!BlockInMask || !Group->isReverse()) &&
2473 "Reversed masked interleave-group not supported.");
2474
2475 Value *Idx;
2476 // If the group is reverse, adjust the index to refer to the last vector lane
2477 // instead of the first. We adjust the index from the first vector lane,
2478 // rather than directly getting the pointer for lane VF - 1, because the
2479 // pointer operand of the interleaved access is supposed to be uniform. For
2480 // uniform instructions, we're only required to generate a value for the
2481 // first vector lane in each unroll iteration.
2482 if (Group->isReverse()) {
2483 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2484 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2488 } else
2490
2491 for (unsigned Part = 0; Part < UF; Part++) {
2492 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2493 if (auto *I = dyn_cast<Instruction>(AddrPart))
2494 State.setDebugLocFrom(I->getDebugLoc());
2495
2496 // Note that the current instruction could be at any member index, so we need
2497 // to adjust the address to that of the member with index 0.
2498 //
2499 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2500 // b = A[i]; // Member of index 0
2501 // The current pointer points to A[i+1]; adjust it to A[i].
2502 //
2503 // E.g. A[i+1] = a; // Member of index 1
2504 // A[i] = b; // Member of index 0
2505 // A[i+2] = c; // Member of index 2 (Current instruction)
2506 // The current pointer points to A[i+2]; adjust it to A[i].
2507
2508 bool InBounds = false;
2509 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2510 InBounds = gep->isInBounds();
2511 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2512 AddrParts.push_back(AddrPart);
2513 }
2514
2515 State.setDebugLocFrom(Instr->getDebugLoc());
2516 Value *PoisonVec = PoisonValue::get(VecTy);
2517
2518 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2519 unsigned Part, Value *MaskForGaps) -> Value * {
2520 if (VF.isScalable()) {
2521 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2522 assert(InterleaveFactor == 2 &&
2523 "Unsupported deinterleave factor for scalable vectors");
2524 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2525 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2526 auto *MaskTy =
2528 return Builder.CreateIntrinsic(
2529 MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2530 /*FMFSource=*/nullptr, "interleaved.mask");
2531 }
2532
2533 if (!BlockInMask)
2534 return MaskForGaps;
2535
2536 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2537 Value *ShuffledMask = Builder.CreateShuffleVector(
2538 BlockInMaskPart,
2539 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2540 "interleaved.mask");
2541 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2542 MaskForGaps)
2543 : ShuffledMask;
2544 };
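// Editorial illustration (not part of the original source): for a fixed VF of
// 4 and an interleave factor of 3, the shuffle above turns a block mask
// <m0, m1, m2, m3> into the replicated group mask
// <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>, so every member of an
// interleaved tuple shares the predicate of its originating iteration; any
// gap mask is then AND-ed on top of that.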
2545
2546 // Vectorize the interleaved load group.
2547 if (isa<LoadInst>(Instr)) {
2548 Value *MaskForGaps = nullptr;
2549 if (NeedsMaskForGaps) {
2550 MaskForGaps =
2552 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2553 }
2554
2555 // For each unroll part, create a wide load for the group.
2556 SmallVector<Value *, 2> NewLoads;
2557 for (unsigned Part = 0; Part < UF; Part++) {
2558 Instruction *NewLoad;
2559 if (BlockInMask || MaskForGaps) {
2561 "masked interleaved groups are not allowed.");
2562 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2563 NewLoad =
2564 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2565 GroupMask, PoisonVec, "wide.masked.vec");
2566 }
2567 else
2568 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2569 Group->getAlign(), "wide.vec");
2570 Group->addMetadata(NewLoad);
2571 NewLoads.push_back(NewLoad);
2572 }
2573
2574 if (VecTy->isScalableTy()) {
2575 assert(InterleaveFactor == 2 &&
2576 "Unsupported deinterleave factor for scalable vectors");
2577
2578 for (unsigned Part = 0; Part < UF; ++Part) {
2579 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2580 // so we must use intrinsics to deinterleave.
2582 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2583 /*FMFSource=*/nullptr, "strided.vec");
2584 unsigned J = 0;
2585 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2586 Instruction *Member = Group->getMember(I);
2587
2588 if (!Member)
2589 continue;
2590
2591 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2592 // If this member has a different type, cast the result type.
2593 if (Member->getType() != ScalarTy) {
2594 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2595 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2596 }
2597
2598 if (Group->isReverse())
2599 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2600
2601 State.set(VPDefs[J], StridedVec, Part);
2602 ++J;
2603 }
2604 }
2605
2606 return;
2607 }
2608
2609 // For each member in the group, shuffle out the appropriate data from the
2610 // wide loads.
2611 unsigned J = 0;
2612 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2613 Instruction *Member = Group->getMember(I);
2614
2615 // Skip the gaps in the group.
2616 if (!Member)
2617 continue;
2618
2619 auto StrideMask =
2620 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2621 for (unsigned Part = 0; Part < UF; Part++) {
2622 Value *StridedVec = Builder.CreateShuffleVector(
2623 NewLoads[Part], StrideMask, "strided.vec");
2624
2625 // If this member has a different type, cast the result type.
2626 if (Member->getType() != ScalarTy) {
2627 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2628 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2629 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2630 }
2631
2632 if (Group->isReverse())
2633 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2634
2635 State.set(VPDefs[J], StridedVec, Part);
2636 }
2637 ++J;
2638 }
2639 return;
2640 }
2641
2642 // The sub-vector type for the current instruction.
2643 auto *SubVT = VectorType::get(ScalarTy, VF);
2644
2645 // Vectorize the interleaved store group.
2646 Value *MaskForGaps =
2648 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2649 "masked interleaved groups are not allowed.");
2650 assert((!MaskForGaps || !VF.isScalable()) &&
2651 "masking gaps for scalable vectors is not yet supported.");
2652 for (unsigned Part = 0; Part < UF; Part++) {
2653 // Collect the stored vector from each member.
2654 SmallVector<Value *, 4> StoredVecs;
2655 unsigned StoredIdx = 0;
2656 for (unsigned i = 0; i < InterleaveFactor; i++) {
2657 assert((Group->getMember(i) || MaskForGaps) &&
2658 "Fail to get a member from an interleaved store group");
2659 Instruction *Member = Group->getMember(i);
2660
2661 // Skip the gaps in the group.
2662 if (!Member) {
2663 Value *Undef = PoisonValue::get(SubVT);
2664 StoredVecs.push_back(Undef);
2665 continue;
2666 }
2667
2668 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2669 ++StoredIdx;
2670
2671 if (Group->isReverse())
2672 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2673
2674 // If this member has a different type, cast it to a unified type.
2675
2676 if (StoredVec->getType() != SubVT)
2677 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2678
2679 StoredVecs.push_back(StoredVec);
2680 }
2681
2682 // Interleave all the smaller vectors into one wider vector.
2683 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2684 Instruction *NewStoreInstr;
2685 if (BlockInMask || MaskForGaps) {
2686 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2687 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2688 Group->getAlign(), GroupMask);
2689 } else
2690 NewStoreInstr =
2691 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2692
2693 Group->addMetadata(NewStoreInstr);
2694 }
2695}
2696
2698 VPReplicateRecipe *RepRecipe,
2699 const VPIteration &Instance,
2700 VPTransformState &State) {
2701 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2702
2703 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2704 // the first lane and part.
2705 if (isa<NoAliasScopeDeclInst>(Instr))
2706 if (!Instance.isFirstIteration())
2707 return;
2708
2709 // Does this instruction return a value?
2710 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2711
2712 Instruction *Cloned = Instr->clone();
2713 if (!IsVoidRetTy) {
2714 Cloned->setName(Instr->getName() + ".cloned");
2715#if !defined(NDEBUG)
2716 // Verify that VPlan type inference results agree with the type of the
2717 // generated values.
2718 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2719 "inferred type and type from generated instructions do not match");
2720#endif
2721 }
2722
2723 RepRecipe->setFlags(Cloned);
2724
2725 if (auto DL = Instr->getDebugLoc())
2726 State.setDebugLocFrom(DL);
2727
2728 // Replace the operands of the cloned instructions with their scalar
2729 // equivalents in the new loop.
2730 for (const auto &I : enumerate(RepRecipe->operands())) {
2731 auto InputInstance = Instance;
2732 VPValue *Operand = I.value();
2734 InputInstance.Lane = VPLane::getFirstLane();
2735 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2736 }
2737 State.addNewMetadata(Cloned, Instr);
2738
2739 // Place the cloned scalar in the new loop.
2740 State.Builder.Insert(Cloned);
2741
2742 State.set(RepRecipe, Cloned, Instance);
2743
2744 // If we just cloned a new assumption, add it to the assumption cache.
2745 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2747
2748 // End if-block.
2749 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2750 if (IfPredicateInstr)
2751 PredicatedInstructions.push_back(Cloned);
2752}
2753
2754Value *
2756 if (VectorTripCount)
2757 return VectorTripCount;
2758
2759 Value *TC = getTripCount();
2760 IRBuilder<> Builder(InsertBlock->getTerminator());
2761
2762 Type *Ty = TC->getType();
2763 // This is where we can make the step a runtime constant.
2764 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2765
2766 // If the tail is to be folded by masking, round the number of iterations N
2767 // up to a multiple of Step instead of rounding down. This is done by first
2768 // adding Step-1 and then rounding down. Note that it's ok if this addition
2769 // overflows: the vector induction variable will eventually wrap to zero given
2770 // that it starts at zero and its Step is a power of two; the loop will then
2771 // exit, with the last early-exit vector comparison also producing all-true.
2772 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2773 // is accounted for in emitIterationCountCheck that adds an overflow check.
2774 if (Cost->foldTailByMasking()) {
2776 "VF*UF must be a power of 2 when folding tail by masking");
2777 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2778 TC = Builder.CreateAdd(
2779 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2780 }
2781
2782 // Now we need to generate the expression for the part of the loop that the
2783 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2784 // iterations are not required for correctness, or N - Step, otherwise. Step
2785 // is equal to the vectorization factor (number of SIMD elements) times the
2786 // unroll factor (number of SIMD instructions).
2787 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2788
2789 // There are cases where we *must* run at least one iteration in the remainder
2790 // loop. See the cost model for when this can happen. If the step evenly
2791 // divides the trip count, we set the remainder to be equal to the step. If
2792 // the step does not evenly divide the trip count, no adjustment is necessary
2793 // since there will already be scalar iterations. Note that the minimum
2794 // iterations check ensures that N >= Step.
2795 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2796 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2797 R = Builder.CreateSelect(IsZero, Step, R);
2798 }
2799
2800 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2801
2802 return VectorTripCount;
2803}
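// Editorial illustration (not part of the original source), with hypothetical
// numbers and Step = VF * UF = 8:
//  - trip count N = 17, no tail folding:   n.mod.vf = 1, n.vec = 16;
//  - trip count N = 17, tail folded:       N is rounded up to 24 first, so
//                                          n.mod.vf = 0 and n.vec = 24;
//  - N = 16 with a required scalar epilogue: the remainder of 0 is bumped up
//    to Step, giving n.vec = 8 so that the scalar loop still runs 8 iterations.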
2804
2806 const DataLayout &DL) {
2807 // Verify that V is a vector type with the same number of elements as DstVTy.
2808 auto *DstFVTy = cast<VectorType>(DstVTy);
2809 auto VF = DstFVTy->getElementCount();
2810 auto *SrcVecTy = cast<VectorType>(V->getType());
2811 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2812 Type *SrcElemTy = SrcVecTy->getElementType();
2813 Type *DstElemTy = DstFVTy->getElementType();
2814 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2815 "Vector elements must have same size");
2816
2817 // Do a direct cast if element types are castable.
2818 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2819 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2820 }
2821 // V cannot be directly cast to the desired vector type.
2822 // This may happen when V is a floating point vector but DstVTy is a vector of
2823 // pointers or vice-versa. Handle this using a two-step bitcast with an
2824 // intermediate integer type, i.e. Ptr <-> Int <-> Float.
2825 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2826 "Only one type should be a pointer type");
2827 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2828 "Only one type should be a floating point type");
2829 Type *IntTy =
2830 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2831 auto *VecIntTy = VectorType::get(IntTy, VF);
2832 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2833 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2834}
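// Editorial illustration (not part of the original source): on a target with
// 64-bit pointers, a <4 x double> value cannot be bitcast directly to
// <4 x ptr>, so the helper above first bitcasts it to <4 x i64> and then emits
// the pointer cast from that integer vector; the element sizes (64 bits) match
// at every step, as the asserts require.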
2835
2837 Value *Count = getTripCount();
2838 // Reuse existing vector loop preheader for TC checks.
2839 // Note that a new preheader block is generated for the vector loop.
2840 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2841 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2842
2843 // Generate code to check if the loop's trip count is less than VF * UF, or
2844 // equal to it in case a scalar epilogue is required; this implies that the
2845 // vector trip count is zero. This check also covers the case where adding one
2846 // to the backedge-taken count overflowed leading to an incorrect trip count
2847 // of zero. In this case we will also jump to the scalar loop.
2848 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2850
2851 // If tail is to be folded, vector loop takes care of all iterations.
2852 Type *CountTy = Count->getType();
2853 Value *CheckMinIters = Builder.getFalse();
2854 auto CreateStep = [&]() -> Value * {
2855 // Create the step as max(MinProfitableTripCount, UF * VF).
2857 return createStepForVF(Builder, CountTy, VF, UF);
2858
2859 Value *MinProfTC =
2861 if (!VF.isScalable())
2862 return MinProfTC;
2864 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2865 };
2866
2867 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2868 if (Style == TailFoldingStyle::None)
2869 CheckMinIters =
2870 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2871 else if (VF.isScalable() &&
2874 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2875 // an overflow to zero when updating induction variables and so an
2876 // additional overflow check is required before entering the vector loop.
2877
2878 // Get the maximum unsigned value for the type.
2879 Value *MaxUIntTripCount =
2880 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2881 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2882
2883 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2884 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2885 }
2886
2887 // Create new preheader for vector loop.
2889 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2890 "vector.ph");
2891
2892 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2893 DT->getNode(Bypass)->getIDom()) &&
2894 "TC check is expected to dominate Bypass");
2895
2896 // Update dominator for Bypass & LoopExit (if needed).
2897 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2898 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2899 // If there is an epilogue which must run, there's no edge from the
2900 // middle block to exit blocks and thus no need to update the immediate
2901 // dominator of the exit blocks.
2903
2904 BranchInst &BI =
2905 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2908 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2909 LoopBypassBlocks.push_back(TCCheckBlock);
2910}
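// Editorial illustration (not part of the original source): with a 32-bit trip
// count of n = 2^32 - 3, a runtime step of VF * UF = 8, a scalable VF and tail
// folding whose overflow check cannot be proven redundant, the guard above
// computes UMax - n = 2 and, since 2 <u 8, branches to the scalar loop rather
// than letting the vector induction variable wrap past zero.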
2911
2913 BasicBlock *const SCEVCheckBlock =
2914 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2915 if (!SCEVCheckBlock)
2916 return nullptr;
2917
2918 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2920 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2921 "Cannot SCEV check stride or overflow when optimizing for size");
2922
2923
2924 // Update the dominator only if this is the first RT check.
2925 if (LoopBypassBlocks.empty()) {
2926 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2927 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2928 // If there is an epilogue which must run, there's no edge from the
2929 // middle block to exit blocks and thus no need to update the immediate
2930 // dominator of the exit blocks.
2931 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2932 }
2933
2934 LoopBypassBlocks.push_back(SCEVCheckBlock);
2935 AddedSafetyChecks = true;
2936 return SCEVCheckBlock;
2937}
2938
2940 // VPlan-native path does not do any analysis for runtime checks currently.
2942 return nullptr;
2943
2944 BasicBlock *const MemCheckBlock =
2945 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2946
2947 // Check if we generated code that checks at runtime whether arrays overlap. We put
2948 // the checks into a separate block to make the more common case of few
2949 // elements faster.
2950 if (!MemCheckBlock)
2951 return nullptr;
2952
2953 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2954 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2955 "Cannot emit memory checks when optimizing for size, unless forced "
2956 "to vectorize.");
2957 ORE->emit([&]() {
2958 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2961 << "Code-size may be reduced by not forcing "
2962 "vectorization, or by source-code modifications "
2963 "eliminating the need for runtime checks "
2964 "(e.g., adding 'restrict').";
2965 });
2966 }
2967
2968 LoopBypassBlocks.push_back(MemCheckBlock);
2969
2970 AddedSafetyChecks = true;
2971
2972 return MemCheckBlock;
2973}
2974
2978 assert(LoopVectorPreHeader && "Invalid loop structure");
2979 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2980 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2981 "multiple exit loop without required epilogue?");
2982
2985 LI, nullptr, Twine(Prefix) + "middle.block");
2988 nullptr, Twine(Prefix) + "scalar.ph");
2989
2990 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2991
2992 // Set up the middle block terminator. Two cases:
2993 // 1) If we know that we must execute the scalar epilogue, emit an
2994 // unconditional branch.
2995 // 2) Otherwise, we must have a single unique exit block (due to how we
2996 // implement the multiple exit case). In this case, set up a conditional
2997 // branch from the middle block to the loop scalar preheader, and the
2998 // exit block. completeLoopSkeleton will update the condition to use an
2999 // iteration check, if required to decide whether to execute the remainder.
3000 BranchInst *BrInst =
3001 Cost->requiresScalarEpilogue(VF.isVector())
3004 Builder.getTrue());
3005 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3007
3008 // Update dominator for loop exit. During skeleton creation, only the vector
3009 // pre-header and the middle block are created. The vector loop is entirely
3010 // created during VPlan execution.
3011 if (!Cost->requiresScalarEpilogue(VF.isVector()))
3012 // If there is an epilogue which must run, there's no edge from the
3013 // middle block to exit blocks and thus no need to update the immediate
3014 // dominator of the exit blocks.
3016}
3017
3019 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3020 ArrayRef<BasicBlock *> BypassBlocks,
3021 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3023 assert(VectorTripCount && "Expected valid arguments");
3024
3025 Instruction *OldInduction = Legal->getPrimaryInduction();
3026 Value *&EndValue = IVEndValues[OrigPhi];
3027 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3028 if (OrigPhi == OldInduction) {
3029 // We know what the end value is.
3030 EndValue = VectorTripCount;
3031 } else {
3033
3034 // Fast-math-flags propagate from the original induction instruction.
3035 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3036 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3037
3039 Step, II.getKind(), II.getInductionBinOp());
3040 EndValue->setName("ind.end");
3041
3042 // Compute the end value for the additional bypass (if applicable).
3043 if (AdditionalBypass.first) {
3044 B.SetInsertPoint(AdditionalBypass.first,
3045 AdditionalBypass.first->getFirstInsertionPt());
3046 EndValueFromAdditionalBypass =
3047 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3048 Step, II.getKind(), II.getInductionBinOp());
3049 EndValueFromAdditionalBypass->setName("ind.end");
3050 }
3051 }
3052
3053 // Create phi nodes to merge from the backedge-taken check block.
3054 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3056 // Copy original phi DL over to the new one.
3057 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3058
3059 // The new PHI merges the original incoming value, in case of a bypass,
3060 // or the value at the end of the vectorized loop.
3061 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3062
3063 // Fix the scalar body counter (PHI node).
3064 // The old induction's phi node in the scalar body needs the truncated
3065 // value.
3066 for (BasicBlock *BB : BypassBlocks)
3067 BCResumeVal->addIncoming(II.getStartValue(), BB);
3068
3069 if (AdditionalBypass.first)
3070 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3071 EndValueFromAdditionalBypass);
3072 return BCResumeVal;
3073}
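// Editorial illustration (not part of the original source): for the primary
// induction 'i' starting at 0, the resume phi created above receives the
// vector trip count (e.g. n.vec = 16) when control arrives from the middle
// block, and the original start value 0 when control arrives from any of the
// bypass blocks, so the scalar remainder loop continues counting from wherever
// the vector loop stopped.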
3074
3075/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3076/// expansion results.
3078 const SCEV2ValueTy &ExpandedSCEVs) {
3079 const SCEV *Step = ID.getStep();
3080 if (auto *C = dyn_cast<SCEVConstant>(Step))
3081 return C->getValue();
3082 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3083 return U->getValue();
3084 auto I = ExpandedSCEVs.find(Step);
3085 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3086 return I->second;
3087}
3088
3090 const SCEV2ValueTy &ExpandedSCEVs,
3091 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3092 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3093 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3094 "Inconsistent information about additional bypass.");
3095 // We are going to resume the execution of the scalar loop.
3096 // Go over all of the induction variables that we found and fix the
3097 // PHIs that are left in the scalar version of the loop.
3098 // The starting values of PHI nodes depend on the counter of the last
3099 // iteration in the vectorized loop.
3100 // If we come from a bypass edge then we need to start from the original
3101 // start value.
3102 for (const auto &InductionEntry : Legal->getInductionVars()) {
3103 PHINode *OrigPhi = InductionEntry.first;
3104 const InductionDescriptor &II = InductionEntry.second;
3105 PHINode *BCResumeVal = createInductionResumeValue(
3106 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3107 AdditionalBypass);
3108 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3109 }
3110}
3111
3113 // The trip counts should be cached by now.
3114 Value *Count = getTripCount();
3116
3117 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3118
3119 // Add a check in the middle block to see if we have completed
3120 // all of the iterations in the first vector loop. Three cases:
3121 // 1) If we require a scalar epilogue, there is no conditional branch as
3122 // we unconditionally branch to the scalar preheader. Do nothing.
3123 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3124 // Thus if tail is to be folded, we know we don't need to run the
3125 // remainder and we can use the previous value for the condition (true).
3126 // 3) Otherwise, construct a runtime check.
3127 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3128 !Cost->foldTailByMasking()) {
3129 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3130 // of the corresponding compare because they may have ended up with
3131 // different line numbers and we want to avoid awkward line stepping while
3132 // debugging. E.g. if the compare has a line number inside the loop.
3133 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3134 // operands. Perform simplification directly on VPlan once the branch is
3135 // modeled there.
3137 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3138 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3139 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3140 BI.setCondition(CmpN);
3141 if (hasBranchWeightMD(*ScalarLatchTerm)) {
3142 // Assume that `Count % VectorTripCount` is equally distributed.
3143 unsigned TripCount = UF * VF.getKnownMinValue();
3144 assert(TripCount > 0 && "trip count should not be zero");
3145 const uint32_t Weights[] = {1, TripCount - 1};
3146 setBranchWeights(BI, Weights);
3147 }
3148 }
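// For example, with VF = 4 and UF = 2 the weights above are {1, 7}: assuming
// the remainder `Count % VectorTripCount` is uniformly distributed, there is
// roughly a 1-in-8 chance that no scalar iterations remain.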
3149
3150#ifdef EXPENSIVE_CHECKS
3151 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3152#endif
3153
3154 return LoopVectorPreHeader;
3155}
3156
3157std::pair<BasicBlock *, Value *>
3159 const SCEV2ValueTy &ExpandedSCEVs) {
3160 /*
3161 In this function we generate a new loop. The new loop will contain
3162 the vectorized instructions while the old loop will continue to run the
3163 scalar remainder.
3164
3165 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3166 / | preheader are expanded here. Eventually all required SCEV
3167 / | expansion should happen here.
3168 / v
3169 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3170 | / |
3171 | / v
3172 || [ ] <-- vector pre header.
3173 |/ |
3174 | v
3175 | [ ] \
3176 | [ ]_| <-- vector loop (created during VPlan execution).
3177 | |
3178 | v
3179 \ -[ ] <--- middle-block.
3180 \/ |
3181 /\ v
3182 | ->[ ] <--- new preheader.
3183 | |
3184 (opt) v <-- edge from middle to exit iff epilogue is not required.
3185 | [ ] \
3186 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3187 \ |
3188 \ v
3189 >[ ] <-- exit block(s).
3190 ...
3191 */
3192
3193 // Create an empty vector loop, and prepare basic blocks for the runtime
3194 // checks.
3196
3197 // Now, compare the new count to zero. If it is zero, skip the vector loop and
3198 // jump to the scalar loop. This check also covers the case where the
3199 // backedge-taken count is uint##_max: adding one to it will overflow, leading
3200 // to an incorrect trip count of zero. In this (rare) case we will also jump
3201 // to the scalar loop.
3203
3204 // Generate the code to check any assumptions that we've made for SCEV
3205 // expressions.
3207
3208 // Generate the code that checks in runtime if arrays overlap. We put the
3209 // checks into a separate block to make the more common case of few elements
3210 // faster.
3212
3213 // Emit phis for the new starting index of the scalar loop.
3214 createInductionResumeValues(ExpandedSCEVs);
3215
3216 return {completeLoopSkeleton(), nullptr};
3217}
3218
3219// Fix up external users of the induction variable. At this point, we are
3220// in LCSSA form, with all external PHIs that use the IV having one input value,
3221// coming from the remainder loop. We need those PHIs to also have a correct
3222// value for the IV when arriving directly from the middle block.
3224 const InductionDescriptor &II,
3225 Value *VectorTripCount, Value *EndValue,
3226 BasicBlock *MiddleBlock,
3227 BasicBlock *VectorHeader, VPlan &Plan,
3228 VPTransformState &State) {
3229 // There are two kinds of external IV usages - those that use the value
3230 // computed in the last iteration (the PHI) and those that use the penultimate
3231 // value (the value that feeds into the phi from the loop latch).
3232 // We allow both, but they, obviously, have different values.
3233
3234 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3235
3236 DenseMap<Value *, Value *> MissingVals;
3237
3238 // An external user of the last iteration's value should see the value that
3239 // the remainder loop uses to initialize its own IV.
3241 for (User *U : PostInc->users()) {
3242 Instruction *UI = cast<Instruction>(U);
3243 if (!OrigLoop->contains(UI)) {
3244 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3245 MissingVals[UI] = EndValue;
3246 }
3247 }
3248
3249 // An external user of the penultimate value needs to see EndValue - Step.
3250 // The simplest way to get this is to recompute it from the constituent SCEVs,
3251 // that is Start + (Step * (CRD - 1)).
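// For example, for an integer add-recurrence with Start = 0, Step = 2 and a
// vector trip count of 8, the escape value is 0 + 2 * (8 - 1) = 14, while
// EndValue (the value the scalar loop resumes from) is 16.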
3252 for (User *U : OrigPhi->users()) {
3253 auto *UI = cast<Instruction>(U);
3254 if (!OrigLoop->contains(UI)) {
3255 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3256 IRBuilder<> B(MiddleBlock->getTerminator());
3257
3258 // Fast-math-flags propagate from the original induction instruction.
3259 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3260 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3261
3262 Value *CountMinusOne = B.CreateSub(
3263 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3264 CountMinusOne->setName("cmo");
3265
3266 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3267 assert(StepVPV && "step must have been expanded during VPlan execution");
3268 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3269 : State.get(StepVPV, {0, 0});
3270 Value *Escape =
3271 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3272 II.getKind(), II.getInductionBinOp());
3273 Escape->setName("ind.escape");
3274 MissingVals[UI] = Escape;
3275 }
3276 }
3277
3278 for (auto &I : MissingVals) {
3279 PHINode *PHI = cast<PHINode>(I.first);
3280 // One corner case we have to handle is two IVs "chasing" each other,
3281 // that is, %IV2 = phi [...], [ %IV1, %latch ]
3282 // In this case, if IV1 has an external use, we need to avoid adding both
3283 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3284 // don't already have an incoming value for the middle block.
3285 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3286 PHI->addIncoming(I.second, MiddleBlock);
3287 Plan.removeLiveOut(PHI);
3288 }
3289 }
3290}
3291
3292namespace {
3293
3294struct CSEDenseMapInfo {
3295 static bool canHandle(const Instruction *I) {
3296 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3297 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3298 }
3299
3300 static inline Instruction *getEmptyKey() {
3302 }
3303
3304 static inline Instruction *getTombstoneKey() {
3306 }
3307
3308 static unsigned getHashValue(const Instruction *I) {
3309 assert(canHandle(I) && "Unknown instruction!");
3310 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3311 I->value_op_end()));
3312 }
3313
3314 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3315 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3316 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3317 return LHS == RHS;
3318 return LHS->isIdenticalTo(RHS);
3319 }
3320};
3321
3322} // end anonymous namespace
3323
3324 /// Perform CSE of induction variable instructions.
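/// For example, two identical getelementptr instructions created while
/// widening an induction are collapsed into one; the duplicate is replaced by
/// the first occurrence and erased.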
3325static void cse(BasicBlock *BB) {
3326 // Perform simple CSE.
3328 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3329 if (!CSEDenseMapInfo::canHandle(&In))
3330 continue;
3331
3332 // Check if we can replace this instruction with any of the
3333 // visited instructions.
3334 if (Instruction *V = CSEMap.lookup(&In)) {
3335 In.replaceAllUsesWith(V);
3336 In.eraseFromParent();
3337 continue;
3338 }
3339
3340 CSEMap[&In] = &In;
3341 }
3342}
3343
3346 ElementCount VF) const {
3347 // We only need to calculate a cost if the VF is scalar; for actual vectors
3348 // we should already have a pre-calculated cost at each VF.
3349 if (!VF.isScalar())
3350 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3351
3353 Type *RetTy = CI->getType();
3355 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3356 return *RedCost;
3357
3359 for (auto &ArgOp : CI->args())
3360 Tys.push_back(ArgOp->getType());
3361
3362 InstructionCost ScalarCallCost =
3364
3365 // If this is an intrinsic we may have a lower cost for it.
3367 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3368 return std::min(ScalarCallCost, IntrinsicCost);
3369 }
3370 return ScalarCallCost;
3371}
3372
3374 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3375 return Elt;
3376 return VectorType::get(Elt, VF);
3377}
3378
3381 ElementCount VF) const {
3383 assert(ID && "Expected intrinsic call!");
3384 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3385 FastMathFlags FMF;
3386 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3387 FMF = FPMO->getFastMathFlags();
3388
3391 SmallVector<Type *> ParamTys;
3392 std::transform(FTy->param_begin(), FTy->param_end(),
3393 std::back_inserter(ParamTys),
3394 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3395
3396 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3397 dyn_cast<IntrinsicInst>(CI));
3398 return TTI.getIntrinsicInstrCost(CostAttrs,
3400}
3401
3403 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3404 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3405 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3406}
3407
3409 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3410 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3411 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3412}
3413
3415 VPlan &Plan) {
3416 // Fix widened non-induction PHIs by setting up the PHI operands.
3418 fixNonInductionPHIs(Plan, State);
3419
3420 // At this point every instruction in the original loop is widened to a
3421 // vector form. Now we need to fix the recurrences in the loop. These PHI
3422 // nodes are currently empty because we did not want to introduce cycles.
3423 // This is the second stage of vectorizing recurrences. Note that fixing
3424 // reduction phis is already modeled in VPlan.
3425 // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3426 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3427 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3428 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3429 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3430 fixFixedOrderRecurrence(FOR, State);
3431 }
3432
3433 // Forget the original basic block.
3436
3437 // After vectorization, the exit blocks of the original loop will have
3438 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3439 // looked through single-entry phis.
3440 SmallVector<BasicBlock *> ExitBlocks;
3441 OrigLoop->getExitBlocks(ExitBlocks);
3442 for (BasicBlock *Exit : ExitBlocks)
3443 for (PHINode &PN : Exit->phis())
3445
3446 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3447 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3448 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3449 // No edge from the middle block to the unique exit block has been inserted
3450 // and there is nothing to fix from vector loop; phis should have incoming
3451 // from scalar loop only.
3452 } else {
3453 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3454 // the cost model.
3455
3456 // If we inserted an edge from the middle block to the unique exit block,
3457 // update uses outside the loop (phis) to account for the newly inserted
3458 // edge.
3459
3460 // Fix-up external users of the induction variables.
3461 for (const auto &Entry : Legal->getInductionVars())
3462 fixupIVUsers(Entry.first, Entry.second,
3464 IVEndValues[Entry.first], LoopMiddleBlock,
3465 VectorLoop->getHeader(), Plan, State);
3466 }
3467
3468 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3469 // in the exit block, so update the builder.
3470 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3471 State.CFG.ExitBB->getFirstNonPHIIt());
3472 for (const auto &KV : Plan.getLiveOuts())
3473 KV.second->fixPhi(Plan, State);
3474
3476 sinkScalarOperands(&*PI);
3477
3478 // Remove redundant induction instructions.
3479 cse(VectorLoop->getHeader());
3480
3481 // Set/update profile weights for the vector and remainder loops as original
3482 // loop iterations are now distributed among them. Note that the original loop
3483 // represented by LoopScalarBody becomes the remainder loop after vectorization.
3484 //
3485 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3486 // end up with a slightly roughened result but that should be OK since the
3487 // profile is not inherently precise anyway. Note also that a possible bypass
3488 // of the vector code caused by legality checks is ignored, optimistically
3489 // assigning all the weight to the vector loop.
3490 //
3491 // For scalable vectorization we can't know at compile time how many iterations
3492 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3493 // vscale of '1'.
3496 VF.getKnownMinValue() * UF);
3497}
3498
3501 // This is the second phase of vectorizing first-order recurrences. An
3502 // overview of the transformation is described below. Suppose we have the
3503 // following loop.
3504 //
3505 // for (int i = 0; i < n; ++i)
3506 // b[i] = a[i] - a[i - 1];
3507 //
3508 // There is a first-order recurrence on "a". For this loop, the shorthand
3509 // scalar IR looks like:
3510 //
3511 // scalar.ph:
3512 // s_init = a[-1]
3513 // br scalar.body
3514 //
3515 // scalar.body:
3516 // i = phi [0, scalar.ph], [i+1, scalar.body]
3517 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3518 // s2 = a[i]
3519 // b[i] = s2 - s1
3520 // br cond, scalar.body, ...
3521 //
3522 // In this example, s1 is a recurrence because its value depends on the
3523 // previous iteration. In the first phase of vectorization, we created a
3524 // vector phi v1 for s1. We now complete the vectorization and produce the
3525 // shorthand vector IR shown below (for VF = 4, UF = 1).
3526 //
3527 // vector.ph:
3528 // v_init = vector(..., ..., ..., a[-1])
3529 // br vector.body
3530 //
3531 // vector.body
3532 // i = phi [0, vector.ph], [i+4, vector.body]
3533 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3534 // v2 = a[i, i+1, i+2, i+3];
3535 // v3 = vector(v1(3), v2(0, 1, 2))
3536 // b[i, i+1, i+2, i+3] = v2 - v3
3537 // br cond, vector.body, middle.block
3538 //
3539 // middle.block:
3540 // x = v2(3)
3541 // br scalar.ph
3542 //
3543 // scalar.ph:
3544 // s_init = phi [x, middle.block], [a[-1], otherwise]
3545 // br scalar.body
3546 //
3547 // After execution completes the vector loop, we extract the next value of
3548 // the recurrence (x) to use as the initial value in the scalar loop.
3549
3550 // Extract the last vector element in the middle block. This will be the
3551 // initial value for the recurrence when jumping to the scalar loop.
3552 VPValue *PreviousDef = PhiR->getBackedgeValue();
3553 Value *Incoming = State.get(PreviousDef, UF - 1);
3554 auto *ExtractForScalar = Incoming;
3555 auto *IdxTy = Builder.getInt32Ty();
3556 Value *RuntimeVF = nullptr;
3557 if (VF.isVector()) {
3558 auto *One = ConstantInt::get(IdxTy, 1);
3560 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3561 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3562 ExtractForScalar =
3563 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3564 }
3565
3566 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3567 assert(PhiR->getNumUsers() == 1 &&
3568 RecurSplice->getOpcode() ==
3570 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3571 SmallVector<VPLiveOut *> LiveOuts;
3572 for (VPUser *U : RecurSplice->users())
3573 if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3574 LiveOuts.push_back(LiveOut);
3575
3576 if (!LiveOuts.empty()) {
3577 // Extract the second-to-last element in the middle block if the
3578 // Phi is used outside the loop. We need to extract the phi itself
3579 // and not the last element (the phi update in the current iteration). This
3580 // will be the value when jumping to the exit block from the
3581 // LoopMiddleBlock, when the scalar loop is not run at all.
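// For example (shorthand), with VF = 4 and the final vector value
// v2 = <s5, s6, s7, s8>, the scalar loop resumes the recurrence from s8 (the
// last element), while an LCSSA phi that uses the recurrence phi itself needs
// s7 (the second-to-last element).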
3582 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3583 if (VF.isVector()) {
3584 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3585 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3586 Incoming, Idx, "vector.recur.extract.for.phi");
3587 } else {
3588 assert(UF > 1 && "VF and UF cannot both be 1");
3589 // When the loop is unrolled without vectorizing, initialize
3590 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3591 // `Incoming`. This is analogous to the vectorized case above:
3592 // extracting the second-to-last element when VF > 1.
3593 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3594 }
3595
3596 for (VPLiveOut *LiveOut : LiveOuts) {
3597 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3598 PHINode *LCSSAPhi = LiveOut->getPhi();
3599 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3600 State.Plan->removeLiveOut(LCSSAPhi);
3601 }
3602 }
3603
3604 // Fix the initial value of the original recurrence in the scalar loop.
3606 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3607 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3608 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3609 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3610 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3611 Start->addIncoming(Incoming, BB);
3612 }
3613
3614 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3615 Phi->setName("scalar.recur");
3616}
3617
3619 // The basic block and loop containing the predicated instruction.
3620 auto *PredBB = PredInst->getParent();
3621 auto *VectorLoop = LI->getLoopFor(PredBB);
3622
3623 // Initialize a worklist with the operands of the predicated instruction.
3624 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3625
3626 // Holds instructions that we need to analyze again. An instruction may be
3627 // reanalyzed if we don't yet know if we can sink it or not.
3628 SmallVector<Instruction *, 8> InstsToReanalyze;
3629
3630 // Returns true if a given use occurs in the predicated block. Phi nodes use
3631 // their operands in their corresponding predecessor blocks.
3632 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3633 auto *I = cast<Instruction>(U.getUser());
3634 BasicBlock *BB = I->getParent();
3635 if (auto *Phi = dyn_cast<PHINode>(I))
3636 BB = Phi->getIncomingBlock(
3637 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3638 return BB == PredBB;
3639 };
3640
3641 // Iteratively sink the scalarized operands of the predicated instruction
3642 // into the block we created for it. When an instruction is sunk, its
3643 // operands are then added to the worklist. The algorithm ends when a pass
3644 // through the worklist doesn't sink a single instruction.
3645 bool Changed;
3646 do {
3647 // Add the instructions that need to be reanalyzed to the worklist, and
3648 // reset the changed indicator.
3649 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3650 InstsToReanalyze.clear();
3651 Changed = false;
3652
3653 while (!Worklist.empty()) {
3654 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3655
3656 // We can't sink an instruction if it is a phi node, is not in the loop,
3657 // may have side effects or may read from memory.
3658 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3659 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3660 I->mayHaveSideEffects() || I->mayReadFromMemory())
3661 continue;
3662
3663 // If the instruction is already in PredBB, check if we can sink its
3664 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3665 // sinking the scalar instruction I, hence it appears in PredBB; but it
3666 // may have failed to sink I's operands (recursively), which we try
3667 // (again) here.
3668 if (I->getParent() == PredBB) {
3669 Worklist.insert(I->op_begin(), I->op_end());
3670 continue;
3671 }
3672
3673 // It's legal to sink the instruction if all its uses occur in the
3674 // predicated block. Otherwise, there's nothing to do yet, and we may
3675 // need to reanalyze the instruction.
3676 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3677 InstsToReanalyze.push_back(I);
3678 continue;
3679 }
3680
3681 // Move the instruction to the beginning of the predicated block, and add
3682 // its operands to the worklist.
3683 I->moveBefore(&*PredBB->getFirstInsertionPt());
3684 Worklist.insert(I->op_begin(), I->op_end());
3685
3686 // The sinking may have enabled other instructions to be sunk, so we will
3687 // need to iterate.
3688 Changed = true;
3689 }
3690 } while (Changed);
3691}
3692
3694 VPTransformState &State) {
3695 auto Iter = vp_depth_first_deep(Plan.getEntry());
3696 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3697 for (VPRecipeBase &P : VPBB->phis()) {
3698 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3699 if (!VPPhi)
3700 continue;
3701 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3702 // Make sure the builder has a valid insert point.
3703 Builder.SetInsertPoint(NewPhi);
3704 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3705 VPValue *Inc = VPPhi->getIncomingValue(i);
3706 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3707 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3708 }
3709 }
3710 }
3711}
3712
3713void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3714 // We should not collect Scalars more than once per VF. Right now, this
3715 // function is called from collectUniformsAndScalars(), which already does
3716 // this check. Collecting Scalars for VF=1 does not make any sense.
3717 assert(VF.isVector() && !Scalars.contains(VF) &&
3718 "This function should not be visited twice for the same VF");
3719
3720 // This avoids any chances of creating a REPLICATE recipe during planning
3721 // since that would result in generation of scalarized code during execution,
3722 // which is not supported for scalable vectors.
3723 if (VF.isScalable()) {
3724 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3725 return;
3726 }
3727
3729
3730 // These sets are used to seed the analysis with pointers used by memory
3731 // accesses that will remain scalar.
3733 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3734 auto *Latch = TheLoop->getLoopLatch();
3735
3736 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3737 // The pointer operands of loads and stores will be scalar as long as the
3738 // memory access is not a gather or scatter operation. The value operand of a
3739 // store will remain scalar if the store is scalarized.
3740 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3741 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3742 assert(WideningDecision != CM_Unknown &&
3743 "Widening decision should be ready at this moment");
3744 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3745 if (Ptr == Store->getValueOperand())
3746 return WideningDecision == CM_Scalarize;
3747 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3748 "Ptr is neither a value or pointer operand");
3749 return WideningDecision != CM_GatherScatter;
3750 };
3751
3752 // A helper that returns true if the given value is a bitcast or
3753 // getelementptr instruction contained in the loop.
3754 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3755 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3756 isa<GetElementPtrInst>(V)) &&
3758 };
3759
3760 // A helper that evaluates a memory access's use of a pointer. If the use will
3761 // be a scalar use and the pointer is only used by memory accesses, we place
3762 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3763 // PossibleNonScalarPtrs.
3764 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3765 // We only care about bitcast and getelementptr instructions contained in
3766 // the loop.
3767 if (!isLoopVaryingBitCastOrGEP(Ptr))
3768 return;
3769
3770 // If the pointer has already been identified as scalar (e.g., if it was
3771 // also identified as uniform), there's nothing to do.
3772 auto *I = cast<Instruction>(Ptr);
3773 if (Worklist.count(I))
3774 return;
3775
3776 // If the use of the pointer will be a scalar use, and all users of the
3777 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3778 // place the pointer in PossibleNonScalarPtrs.
3779 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3780 return isa<LoadInst>(U) || isa<StoreInst>(U);
3781 }))
3782 ScalarPtrs.insert(I);
3783 else
3784 PossibleNonScalarPtrs.insert(I);
3785 };
3786
3787 // We seed the scalars analysis with two classes of instructions: (1)
3788 // instructions marked uniform-after-vectorization and (2) bitcast,
3789 // getelementptr and (pointer) phi instructions used by memory accesses
3790 // requiring a scalar use.
3791 //
3792 // (1) Add to the worklist all instructions that have been identified as
3793 // uniform-after-vectorization.
3794 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3795
3796 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3797 // memory accesses requiring a scalar use. The pointer operands of loads and
3798 // stores will be scalar as long as the memory access is not a gather or
3799 // scatter operation. The value operand of a store will remain scalar if the
3800 // store is scalarized.
3801 for (auto *BB : TheLoop->blocks())
3802 for (auto &I : *BB) {
3803 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3804 evaluatePtrUse(Load, Load->getPointerOperand());
3805 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3806 evaluatePtrUse(Store, Store->getPointerOperand());
3807 evaluatePtrUse(Store, Store->getValueOperand());
3808 }
3809 }
3810 for (auto *I : ScalarPtrs)
3811 if (!PossibleNonScalarPtrs.count(I)) {
3812 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3813 Worklist.insert(I);
3814 }
3815
3816 // Insert the forced scalars.
3817 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3818 // induction variable when the PHI user is scalarized.
3819 auto ForcedScalar = ForcedScalars.find(VF);
3820 if (ForcedScalar != ForcedScalars.end())
3821 for (auto *I : ForcedScalar->second) {
3822 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3823 Worklist.insert(I);
3824 }
3825
3826 // Expand the worklist by looking through any bitcasts and getelementptr
3827 // instructions we've already identified as scalar. This is similar to the
3828 // expansion step in collectLoopUniforms(); however, here we're only
3829 // expanding to include additional bitcasts and getelementptr instructions.
3830 unsigned Idx = 0;
3831 while (Idx != Worklist.size()) {
3832 Instruction *Dst = Worklist[Idx++];
3833 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3834 continue;
3835 auto *Src = cast<Instruction>(Dst->getOperand(0));
3836 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3837 auto *J = cast<Instruction>(U);
3838 return !TheLoop->contains(J) || Worklist.count(J) ||
3839 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3840 isScalarUse(J, Src));
3841 })) {
3842 Worklist.insert(Src);
3843 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3844 }
3845 }
3846
3847 // An induction variable will remain scalar if all users of the induction
3848 // variable and induction variable update remain scalar.
3849 for (const auto &Induction : Legal->getInductionVars()) {
3850 auto *Ind = Induction.first;
3851 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3852
3853 // If tail-folding is applied, the primary induction variable will be used
3854 // to feed a vector compare.
3855 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3856 continue;
3857
3858 // Returns true if \p Indvar is a pointer induction that is used directly by
3859 // load/store instruction \p I.
3860 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3861 Instruction *I) {
3862 return Induction.second.getKind() ==
3864 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3865 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3866 };
3867
3868 // Determine if all users of the induction variable are scalar after
3869 // vectorization.
3870 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3871 auto *I = cast<Instruction>(U);
3872 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3873 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3874 });
3875 if (!ScalarInd)
3876 continue;
3877
3878 // Determine if all users of the induction variable update instruction are
3879 // scalar after vectorization.
3880 auto ScalarIndUpdate =
3881 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3882 auto *I = cast<Instruction>(U);
3883 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3884 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3885 });
3886 if (!ScalarIndUpdate)
3887 continue;
3888
3889 // The induction variable and its update instruction will remain scalar.
3890 Worklist.insert(Ind);
3891 Worklist.insert(IndUpdate);
3892 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3893 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3894 << "\n");
3895 }
3896
3897 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3898}
3899
3901 Instruction *I, ElementCount VF) const {
3902 if (!isPredicatedInst(I))
3903 return false;
3904
3905 // Do we have a non-scalar lowering for this predicated
3906 // instruction? No - it is scalar with predication.
3907 switch(I->getOpcode()) {
3908 default:
3909 return true;
3910 case Instruction::Call:
3911 if (VF.isScalar())
3912 return true;
3913 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3914 .Kind == CM_Scalarize;
3915 case Instruction::Load:
3916 case Instruction::Store: {
3918 auto *Ty = getLoadStoreType(I);
3919 Type *VTy = Ty;
3920 if (VF.isVector())
3921 VTy = VectorType::get(Ty, VF);
3922 const Align Alignment = getLoadStoreAlignment(I);
3923 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3924 TTI.isLegalMaskedGather(VTy, Alignment))
3925 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3926 TTI.isLegalMaskedScatter(VTy, Alignment));
3927 }
3928 case Instruction::UDiv:
3929 case Instruction::SDiv:
3930 case Instruction::SRem:
3931 case Instruction::URem: {
3932 // We have the option to use the safe-divisor idiom to avoid predication.
3933 // The cost based decision here will always select safe-divisor for
3934 // scalable vectors as scalarization isn't legal.
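// For example (shorthand), a predicated `udiv %x, %d` can instead be emitted
// unconditionally as `udiv %x, (select %mask, %d, 1)`, trading the cost of the
// select against the cost of scalarizing and predicating each lane.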
3935 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3936 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3937 }
3938 }
3939}
3940
3942 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3943 return false;
3944
3945 // Can we prove this instruction is safe to unconditionally execute?
3946 // If not, we must use some form of predication.
3947 switch(I->getOpcode()) {
3948 default:
3949 return false;
3950 case Instruction::Load:
3951 case Instruction::Store: {
3952 if (!Legal->isMaskRequired(I))
3953 return false;
3954 // When we know the load's address is loop invariant and the instruction
3955 // in the original scalar loop was unconditionally executed then we
3956 // don't need to mark it as a predicated instruction. Tail folding may
3957 // introduce additional predication, but we're guaranteed to always have
3958 // at least one active lane. We call Legal->blockNeedsPredication here
3959 // because it doesn't query tail-folding. For stores, we need to prove both
3960 // speculation safety (which follows from the same argument as for loads)
3961 // and that the value being stored is correct. The easiest form of the
3962 // latter is to require that all values stored are the same.
3964 (isa<LoadInst>(I) ||
3965 (isa<StoreInst>(I) &&
3966 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3967 !Legal->blockNeedsPredication(I->getParent()))
3968 return false;
3969 return true;
3970 }
3971 case Instruction::UDiv:
3972 case Instruction::SDiv:
3973 case Instruction::SRem:
3974 case Instruction::URem:
3975 // TODO: We can use the loop preheader as the context point here and get
3976 // context-sensitive reasoning.
3978 case Instruction::Call:
3979 return Legal->isMaskRequired(I);
3980 }
3981}
3982
3983std::pair<InstructionCost, InstructionCost>
3985 ElementCount VF) const {
3986 assert(I->getOpcode() == Instruction::UDiv ||
3987 I->getOpcode() == Instruction::SDiv ||
3988 I->getOpcode() == Instruction::SRem ||
3989 I->getOpcode() == Instruction::URem);
3991
3993
3994 // Scalarization isn't legal for scalable vector types
3995 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3996 if (!VF.isScalable()) {
3997 // Get the scalarization cost and scale this amount by the probability of
3998 // executing the predicated block. If the instruction is not predicated,
3999 // we fall through to the next case.
4000 ScalarizationCost = 0;
4001
4002 // These instructions have a non-void type, so account for the phi nodes
4003 // that we will create. This cost is likely to be zero. The phi node
4004 // cost, if any, should be scaled by the block probability because it
4005 // models a copy at the end of each predicated block.
4006 ScalarizationCost += VF.getKnownMinValue() *
4007 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4008
4009 // The cost of the non-predicated instruction.
4010 ScalarizationCost += VF.getKnownMinValue() *
4011 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4012
4013 // The cost of insertelement and extractelement instructions needed for
4014 // scalarization.
4015 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4016
4017 // Scale the cost by the probability of executing the predicated blocks.
4018 // This assumes the predicated block for each vector lane is equally
4019 // likely.
4020 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
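// For example, assuming predicated blocks execute about half the time (a
// reciprocal block probability of 2), the scalarization estimate above is
// halved.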
4021 }
4022 InstructionCost SafeDivisorCost = 0;
4023
4024 auto *VecTy = ToVectorTy(I->getType(), VF);
4025
4026 // The cost of the select guard to ensure all lanes are well defined
4027 // after we speculate above any internal control flow.
4028 SafeDivisorCost += TTI.getCmpSelInstrCost(
4029 Instruction::Select, VecTy,
4030 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4032
4033 // Certain instructions can be cheaper to vectorize if they have a constant
4034 // second vector operand. One example of this is shifts on x86.
4035 Value *Op2 = I->getOperand(1);
4036 auto Op2Info = TTI.getOperandInfo(Op2);
4037 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4038 Legal->isInvariant(Op2))
4040
4041 SmallVector<const Value *, 4> Operands(I->operand_values());
4042 SafeDivisorCost += TTI.getArithmeticInstrCost(
4043 I->getOpcode(), VecTy, CostKind,
4044 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4045 Op2Info, Operands, I);
4046 return {ScalarizationCost, SafeDivisorCost};
4047}
4048
4050 Instruction *I, ElementCount VF) {
4051 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4053 "Decision should not be set yet.");
4054 auto *Group = getInterleavedAccessGroup(I);
4055 assert(Group && "Must have a group.");
4056
4057 // If the instruction's allocated size doesn't equal its type size, it
4058 // requires padding and will be scalarized.
4059 auto &DL = I->getModule()->getDataLayout();
4060 auto *ScalarTy = getLoadStoreType(I);
4061 if (hasIrregularType(ScalarTy, DL))
4062 return false;
4063
4064 // If the group involves a non-integral pointer, we may not be able to
4065 // losslessly cast all values to a common type.
4066 unsigned InterleaveFactor = Group->getFactor();
4067 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4068 for (unsigned i = 0; i < InterleaveFactor; i++) {
4069 Instruction *Member = Group->getMember(i);
4070 if (!Member)
4071 continue;
4072 auto *MemberTy = getLoadStoreType(Member);
4073 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4074 // Don't coerce non-integral pointers to integers or vice versa.
4075 if (MemberNI != ScalarNI) {
4076 // TODO: Consider adding special nullptr value case here
4077 return false;
4078 } else if (MemberNI && ScalarNI &&
4079 ScalarTy->getPointerAddressSpace() !=
4080 MemberTy->getPointerAddressSpace()) {
4081 return false;
4082 }
4083 }
4084
4085 // Check if masking is required.
4086 // A Group may need masking for one of two reasons: it resides in a block that
4087 // needs predication, or it was decided to use masking to deal with gaps
4088 // (either a gap at the end of a load-access that may result in a speculative
4089 // load, or any gaps in a store-access).
4090 bool PredicatedAccessRequiresMasking =
4091 blockNeedsPredicationForAnyReason(I->getParent()) &&
4093 bool LoadAccessWithGapsRequiresEpilogMasking =
4094 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4096 bool StoreAccessWithGapsRequiresMasking =
4097 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4098 if (!PredicatedAccessRequiresMasking &&
4099 !LoadAccessWithGapsRequiresEpilogMasking &&
4100 !StoreAccessWithGapsRequiresMasking)
4101 return true;
4102
4103 // If masked interleaving is required, we expect that the user/target had
4104 // enabled it, because otherwise it either wouldn't have been created or
4105 // it should have been invalidated by the CostModel.
4107 "Masked interleave-groups for predicated accesses are not enabled.");
4108
4109 if (Group->isReverse())
4110 return false;
4111
4112 auto *Ty = getLoadStoreType(I);
4113 const Align Alignment = getLoadStoreAlignment(I);
4114 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4115 : TTI.isLegalMaskedStore(Ty, Alignment);
4116}
4117
4119 Instruction *I, ElementCount VF) {
4120 // Get and ensure we have a valid memory instruction.
4121 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4122
4124 auto *ScalarTy = getLoadStoreType(I);
4125
4126 // In order to be widened, the pointer should be consecutive, first of all.
4127 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4128 return false;
4129
4130 // If the instruction is a store located in a predicated block, it will be
4131 // scalarized.
4132 if (isScalarWithPredication(I, VF))
4133 return false;
4134
4135 // If the instruction's allocated size doesn't equal its type size, it
4136 // requires padding and will be scalarized.
4137 auto &DL = I->getModule()->getDataLayout();
4138 if (hasIrregularType(ScalarTy, DL))
4139 return false;
4140
4141 return true;
4142}
4143
4144void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4145 // We should not collect Uniforms more than once per VF. Right now,
4146 // this function is called from collectUniformsAndScalars(), which
4147 // already does this check. Collecting Uniforms for VF=1 does not make any
4148 // sense.
4149
4150 assert(VF.isVector() && !Uniforms.contains(VF) &&
4151 "This function should not be visited twice for the same VF");
4152
4153 // Visit the list of Uniforms. If we do not find any uniform value, we will
4154 // not analyze it again. Uniforms.count(VF) will still return 1.
4155 Uniforms[VF].clear();
4156
4157 // We now know that the loop is vectorizable!
4158 // Collect instructions inside the loop that will remain uniform after
4159 // vectorization.
4160
4161 // Global values, params and instructions outside of current loop are out of
4162 // scope.
4163 auto isOutOfScope = [&](Value *V) -> bool {
4164 Instruction *I = dyn_cast<Instruction>(V);
4165 return (!I || !TheLoop->contains(I));
4166 };
4167
4168 // Worklist containing uniform instructions demanding lane 0.
4169 SetVector<Instruction *> Worklist;
4170 BasicBlock *Latch = TheLoop->getLoopLatch();
4171
4172 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4173 // that are scalar with predication must not be considered uniform after
4174 // vectorization, because that would create an erroneous replicating region
4175 // where only a single instance out of VF should be formed.
4176 // TODO: optimize such rare cases if found important; see PR40816.
4177 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4178 if (isOutOfScope(I)) {
4179 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4180 << *I << "\n");
4181 return;
4182 }
4183 if (isScalarWithPredication(I, VF)) {
4184 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4185 << *I << "\n");
4186 return;
4187 }
4188 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4189 Worklist.insert(I);
4190 };
4191
4192 // Start with the conditional branch. If the branch condition is an
4193 // instruction contained in the loop that is only used by the branch, it is
4194 // uniform.
4195 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4196 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4197 addToWorklistIfAllowed(Cmp);
4198
4199 auto PrevVF = VF.divideCoefficientBy(2);
4200 // Return true if all lanes perform the same memory operation, and we can
4201 // thus choose to execute only one.
4202 auto isUniformMemOpUse = [&](Instruction *I) {
4203 // If the value was already known to not be uniform for the previous
4204 // (smaller VF), it cannot be uniform for the larger VF.
4205 if (PrevVF.isVector()) {
4206 auto Iter = Uniforms.find(PrevVF);
4207 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4208 return false;
4209 }
4210 if (!Legal->isUniformMemOp(*I, VF))
4211 return false;
4212 if (isa<LoadInst>(I))
4213 // Loading the same address always produces the same result - at least
4214 // assuming aliasing and ordering which have already been checked.
4215 return true;
4216 // Storing the same value on every iteration.
4217 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4218 };
4219
4220 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4221 InstWidening WideningDecision = getWideningDecision(I, VF);
4222 assert(WideningDecision != CM_Unknown &&
4223 "Widening decision should be ready at this moment");
4224
4225 if (isUniformMemOpUse(I))
4226 return true;
4227
4228 return (WideningDecision == CM_Widen ||
4229 WideningDecision == CM_Widen_Reverse ||
4230 WideningDecision == CM_Interleave);
4231 };
4232
4233 // Returns true if Ptr is the pointer operand of a memory access instruction
4234 // I, I is known to not require scalarization, and the pointer is not also
4235 // stored.
4236 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4237 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4238 return false;
4239 return getLoadStorePointerOperand(I) == Ptr &&
4240 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4241 };
4242
4243 // Holds a list of values which are known to have at least one uniform use.
4244 // Note that there may be other uses which aren't uniform. A "uniform use"
4245 // here is something which only demands lane 0 of the unrolled iterations;
4246 // it does not imply that all lanes produce the same value (i.e. this is not
4247 // the usual meaning of uniform).
4248 SetVector<Value *> HasUniformUse;
4249
4250 // Scan the loop for instructions which are either a) known to have only
4251 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4252 for (auto *BB : TheLoop->blocks())
4253 for (auto &I : *BB) {
4254 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4255 switch (II->getIntrinsicID()) {
4256 case Intrinsic::sideeffect:
4257 case Intrinsic::experimental_noalias_scope_decl:
4258 case Intrinsic::assume:
4259 case Intrinsic::lifetime_start:
4260 case Intrinsic::lifetime_end:
4262 addToWorklistIfAllowed(&I);
4263 break;
4264 default:
4265 break;
4266 }
4267 }
4268
4269 // ExtractValue instructions must be uniform, because the operands are
4270 // known to be loop-invariant.
4271 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4272 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4273 "Expected aggregate value to be loop invariant");
4274 addToWorklistIfAllowed(EVI);
4275 continue;
4276 }
4277
4278 // If there's no pointer operand, there's nothing to do.
4280 if (!Ptr)
4281 continue;
4282
4283 if (isUniformMemOpUse(&I))
4284 addToWorklistIfAllowed(&I);
4285
4286 if (isVectorizedMemAccessUse(&I, Ptr))
4287 HasUniformUse.insert(Ptr);
4288 }
4289
4290 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4291 // demanding) users. Since loops are assumed to be in LCSSA form, this
4292 // disallows uses outside the loop as well.
4293 for (auto *V : HasUniformUse) {
4294 if (isOutOfScope(V))
4295 continue;
4296 auto *I = cast<Instruction>(V);
4297 auto UsersAreMemAccesses =
4298 llvm::all_of(I->users(), [&](User *U) -> bool {
4299 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4300 });
4301 if (UsersAreMemAccesses)
4302 addToWorklistIfAllowed(I);
4303 }
4304
4305 // Expand Worklist in topological order: whenever a new instruction
4306 // is added, its users should already be inside Worklist. This ensures that
4307 // a uniform instruction will only be used by uniform instructions.
4308 unsigned idx = 0;
4309 while (idx != Worklist.size()) {
4310 Instruction *I = Worklist[idx++];
4311
4312 for (auto *OV : I->operand_values()) {
4313 // isOutOfScope operands cannot be uniform instructions.
4314 if (isOutOfScope(OV))
4315 continue;
4316 // First-order recurrence phis should typically be considered
4317 // non-uniform.
4318 auto *OP = dyn_cast<PHINode>(OV);
4320 continue;
4321 // If all the users of the operand are uniform, then add the
4322 // operand into the uniform worklist.
4323 auto *OI = cast<Instruction>(OV);
4324 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4325 auto *J = cast<Instruction>(U);
4326 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4327 }))
4328 addToWorklistIfAllowed(OI);
4329 }
4330 }
4331
4332 // For an instruction to be added into Worklist above, all its users inside
4333 // the loop should also be in Worklist. However, this condition cannot be
4334 // true for phi nodes that form a cyclic dependence. We must process phi
4335 // nodes separately. An induction variable will remain uniform if all users
4336 // of the induction variable and induction variable update remain uniform.
4337 // The code below handles both pointer and non-pointer induction variables.
4338 for (const auto &Induction : Legal->getInductionVars()) {
4339 auto *Ind = Induction.first;
4340 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4341
4342 // Determine if all users of the induction variable are uniform after
4343 // vectorization.
4344 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4345 auto *I = cast<Instruction>(U);
4346 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4347 isVectorizedMemAccessUse(I, Ind);
4348 });
4349 if (!UniformInd)
4350 continue;
4351
4352 // Determine if all users of the induction variable update instruction are
4353 // uniform after vectorization.
4354 auto UniformIndUpdate =
4355 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4356 auto *I = cast<Instruction>(U);
4357 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4358 isVectorizedMemAccessUse(I, IndUpdate);
4359 });
4360 if (!UniformIndUpdate)
4361 continue;
4362
4363 // The induction variable and its update instruction will remain uniform.
4364 addToWorklistIfAllowed(Ind);
4365 addToWorklistIfAllowed(IndUpdate);
4366 }
4367
4368 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4369}
4370
4372 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4373
4375 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4376 "runtime pointer checks needed. Enable vectorization of this "
4377 "loop with '#pragma clang loop vectorize(enable)' when "
4378 "compiling with -Os/-Oz",
4379 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4380 return true;
4381 }
4382
4383 if (!PSE.getPredicate().isAlwaysTrue()) {
4384 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4385 "runtime SCEV checks needed. Enable vectorization of this "
4386 "loop with '#pragma clang loop vectorize(enable)' when "
4387 "compiling with -Os/-Oz",
4388 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4389 return true;
4390 }
4391
4392 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4393 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4394 reportVectorizationFailure("Runtime stride check for small trip count",
4395 "runtime stride == 1 checks needed. Enable vectorization of "
4396 "this loop without such check by compiling with -Os/-Oz",
4397 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4398 return true;
4399 }
4400
4401 return false;
4402}
4403
4405LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4407 return ElementCount::getScalable(0);
4408
4410 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4411 "ScalableVectorizationDisabled", ORE, TheLoop);
4412 return ElementCount::getScalable(0);
4413 }
4414
4415 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4416
4417 auto MaxScalableVF = ElementCount::getScalable(
4418 std::numeric_limits<ElementCount::ScalarTy>::max());
4419
4420 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4421 // FIXME: While for scalable vectors this is currently sufficient, this should
4422 // be replaced by a more detailed mechanism that filters out specific VFs,
4423 // instead of invalidating vectorization for a whole set of VFs based on the
4424 // MaxVF.
4425
4426 // Disable scalable vectorization if the loop contains unsupported reductions.
4427 if (!canVectorizeReductions(MaxScalableVF)) {
4429 "Scalable vectorization not supported for the reduction "
4430 "operations found in this loop.",
4431 "ScalableVFUnfeasible", ORE, TheLoop);
4432 return ElementCount::getScalable(0);
4433 }
4434
4435 // Disable scalable vectorization if the loop contains any instructions
4436 // with element types not supported for scalable vectors.
4437 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4438 return !Ty->isVoidTy() &&
4440 })) {
4441 reportVectorizationInfo("Scalable vectorization is not supported "
4442 "for all element types found in this loop.",
4443 "ScalableVFUnfeasible", ORE, TheLoop);
4444 return ElementCount::getScalable(0);
4445 }
4446
4448 return MaxScalableVF;
4449
4450 // Limit MaxScalableVF by the maximum safe dependence distance.
4451 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4452 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4453 else
4454 MaxScalableVF = ElementCount::getScalable(0);
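// For example, with MaxSafeElements = 32 and a maximum vscale of 16, the
// largest safe scalable VF is vscale x 2.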
4455
4456 if (!MaxScalableVF)
4458 "Max legal vector width too small, scalable vectorization "
4459 "unfeasible.",
4460 "ScalableVFUnfeasible", ORE, TheLoop);
4461
4462 return MaxScalableVF;
4463}
4464
4465FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4466 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4468 unsigned SmallestType, WidestType;
4469 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4470
4471 // Get the maximum safe dependence distance in bits computed by LAA.
4472 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4473 // the memory access that is most restrictive (involved in the smallest
4474 // dependence distance).
4475 unsigned MaxSafeElements =
4477
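// For example, if LAA reports a maximum safe vector width of 256 bits and the
// widest type in the loop is 32 bits wide, MaxSafeElements is 8.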
4478 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4479 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4480
4481 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4482 << ".\n");
4483 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4484 << ".\n");
4485
4486 // First analyze the UserVF, fall back if the UserVF should be ignored.
4487 if (UserVF) {
4488 auto MaxSafeUserVF =
4489 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4490
4491 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4492 // If `VF=vscale x N` is safe, then so is `VF=N`
4493 if (UserVF.isScalable())
4494 return FixedScalableVFPair(
4495 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4496 else
4497 return UserVF;
4498 }
4499
4500 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4501
4502 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4503 // is better to ignore the hint and let the compiler choose a suitable VF.
4504 if (!UserVF.isScalable()) {
4505 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4506 << " is unsafe, clamping to max safe VF="
4507 << MaxSafeFixedVF << ".\n");
4508 ORE->emit([&]() {
4509 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4511 TheLoop->getHeader())
4512 << "User-specified vectorization factor "
4513 << ore::NV("UserVectorizationFactor", UserVF)
4514 << " is unsafe, clamping to maximum safe vectorization factor "
4515 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4516 });
4517 return MaxSafeFixedVF;
4518 }
4519
4521 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4522 << " is ignored because scalable vectors are not "
4523 "available.\n");
4524 ORE->emit([&]() {
4525 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4527 TheLoop->getHeader())
4528 << "User-specified vectorization factor "
4529 << ore::NV("UserVectorizationFactor", UserVF)
4530 << " is ignored because the target does not support scalable "
4531 "vectors. The compiler will pick a more suitable value.";
4532 });
4533 } else {
4534 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4535 << " is unsafe. Ignoring scalable UserVF.\n");
4536 ORE->emit([&]() {
4537 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4539 TheLoop->getHeader())
4540 << "User-specified vectorization factor "
4541 << ore::NV("UserVectorizationFactor", UserVF)
4542 << " is unsafe. Ignoring the hint to let the compiler pick a "
4543 "more suitable value.";
4544 });
4545 }
4546 }
4547
4548 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4549 << " / " << WidestType << " bits.\n");
4550
4553 if (auto MaxVF =
4554 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4555 MaxSafeFixedVF, FoldTailByMasking))
4556 Result.FixedVF = MaxVF;
4557
4558 if (auto MaxVF =
4559 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4560 MaxSafeScalableVF, FoldTailByMasking))
4561 if (MaxVF.isScalable()) {
4562 Result.ScalableVF = MaxVF;
4563 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4564 << "\n");
4565 }
4566
4567 return Result;
4568}
4569
4573 // TODO: It may be useful to do this since it's still likely to be dynamically
4574 // uniform if the target can skip.
4576 "Not inserting runtime ptr check for divergent target",
4577 "runtime pointer checks needed. Not enabled for divergent target",
4578 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4580 }
4581
4582 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4583 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4584 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4585 if (TC == 1) {
4586 reportVectorizationFailure("Single iteration (non) loop",
4587 "loop trip count is one, irrelevant for vectorization",
4588 "SingleIterationLoop", ORE, TheLoop);
4590 }
4591
4592 switch (ScalarEpilogueStatus) {
4594 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4596 [[fallthrough]];
4598 LLVM_DEBUG(
4599 dbgs() << "LV: vector predicate hint/switch found.\n"
4600 << "LV: Not allowing scalar epilogue, creating predicated "
4601 << "vector loop.\n");
4602 break;
4604 // fallthrough as a special case of OptForSize
4606 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4607 LLVM_DEBUG(
4608 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4609 else
4610 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4611 << "count.\n");
4612
4613 // Bail if runtime checks are required, which are not good when optimising
4614 // for size.
4617
4618 break;
4619 }
4620
4621 // The only loops we can vectorize without a scalar epilogue are loops with
4622 // a bottom-test and a single exiting block. We'd have to handle the fact
4623 // that not every instruction executes on the last iteration. This will
4624 // require a lane mask which varies through the vector loop body. (TODO)
4626 // If there was a tail-folding hint/switch, but we can't fold the tail by
4627 // masking, fallback to a vectorization with a scalar epilogue.
4628 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4629 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4630 "scalar epilogue instead.\n");
4631 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4632 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4633 }
4635 }
4636
4637 // Now try the tail folding
4638
4639 // Invalidate interleave groups that require an epilogue if we can't mask
4640 // the interleave-group.
4642 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4643 "No decisions should have been taken at this point");
4644 // Note: There is no need to invalidate any cost modeling decisions here, as
4645 // none were taken so far.
4647 }
4648
4649 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4650
4651 // Avoid tail folding if the trip count is known to be a multiple of any VF
4652 // we choose.
4653 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4654 MaxFactors.FixedVF.getFixedValue();
4655 if (MaxFactors.ScalableVF) {
4656 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4657 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4658 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4659 *MaxPowerOf2RuntimeVF,
4660 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4661 } else
4662 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4663 }
4664
4665 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4666 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4667 "MaxFixedVF must be a power of 2");
4668 unsigned MaxVFtimesIC =
4669 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4670 ScalarEvolution *SE = PSE.getSE();
4671 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4672 const SCEV *ExitCount = SE->getAddExpr(
4673 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4674 const SCEV *Rem = SE->getURemExpr(
4675 SE->applyLoopGuards(ExitCount, TheLoop),
4676 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4677 if (Rem->isZero()) {
4678 // Accept MaxFixedVF if we do not have a tail.
4679 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4680 return MaxFactors;
4681 }
4682 }
4683
4684 // If we don't know the precise trip count, or if the trip count that we
4685 // found modulo the vectorization factor is not zero, try to fold the tail
4686 // by masking.
4687 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4688 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4689 if (foldTailByMasking()) {
4691 LLVM_DEBUG(
4692 dbgs()
4693 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4694 "try to generate VP Intrinsics with scalable vector "
4695 "factors only.\n");
4696 // A tail-folded loop using VP intrinsics restricts the VF to be scalable
4697 // for now.
4698 // TODO: extend it for fixed vectors, if required.
4699 assert(MaxFactors.ScalableVF.isScalable() &&
4700 "Expected scalable vector factor.");
4701
4702 MaxFactors.FixedVF = ElementCount::getFixed(1);
4703 }
4704 return MaxFactors;
4705 }
4706
4707 // If there was a tail-folding hint/switch, but we can't fold the tail by
4708 // masking, fallback to a vectorization with a scalar epilogue.
4709 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4710 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4711 "scalar epilogue instead.\n");
4712 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4713 return MaxFactors;
4714 }
4715
4716 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4717 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4719 }
4720
4721 if (TC == 0) {
4723 "Unable to calculate the loop count due to complex control flow",
4724 "unable to calculate the loop count due to complex control flow",
4725 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4727 }
4728
4730 "Cannot optimize for size and vectorize at the same time.",
4731 "cannot optimize for size and vectorize at the same time. "
4732 "Enable vectorization of this loop with '#pragma clang loop "
4733 "vectorize(enable)' when compiling with -Os/-Oz",
4734 "NoTailLoopWithOptForSize", ORE, TheLoop);
4736}
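// Illustrative sketch (standalone, not part of the vectorizer): the SCEV-based
// check above asks, in plain integer terms, "does MaxVF * UserIC divide the
// trip count BTC + 1?".  The helper below is a made-up, simplified version
// assuming the trip count is a known unsigned value rather than a SCEV.
static bool sketchNoTailRemains(unsigned TripCount, unsigned MaxVF,
                                unsigned UserIC) {
  // Treat an unspecified interleave count as 1, mirroring the UserIC handling
  // in computeMaxVF above.
  unsigned VFTimesIC = MaxVF * (UserIC ? UserIC : 1);
  // No scalar tail remains when the trip count is an exact multiple,
  // e.g. TripCount = 128, MaxVF = 8, UserIC = 2 -> 128 % 16 == 0.
  return VFTimesIC != 0 && TripCount % VFTimesIC == 0;
}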
4737
4738ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4739 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4740 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4741 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4742 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4743 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4744 : TargetTransformInfo::RGK_FixedWidthVector);
4745
4746 // Convenience function to return the minimum of two ElementCounts.
4747 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4748 assert((LHS.isScalable() == RHS.isScalable()) &&
4749 "Scalable flags must match");
4750 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4751 };
4752
4753 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4754 // Note that both WidestRegister and WidestType may not be powers of 2.
4755 auto MaxVectorElementCount = ElementCount::get(
4756 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4757 ComputeScalableMaxVF);
4758 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4759 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4760 << (MaxVectorElementCount * WidestType) << " bits.\n");
4761
4762 if (!MaxVectorElementCount) {
4763 LLVM_DEBUG(dbgs() << "LV: The target has no "
4764 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4765 << " vector registers.\n");
4766 return ElementCount::getFixed(1);
4767 }
4768
4769 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4770 if (MaxVectorElementCount.isScalable() &&
4771 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4772 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4773 auto Min = Attr.getVScaleRangeMin();
4774 WidestRegisterMinEC *= Min;
4775 }
4776
4777 // When a scalar epilogue is required, at least one iteration of the scalar
4778 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4779 // max VF that results in a dead vector loop.
4780 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4781 MaxTripCount -= 1;
4782
4783 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4784 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4785 // If upper bound loop trip count (TC) is known at compile time there is no
4786 // point in choosing VF greater than TC (as done in the loop below). Select
4787 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4788 // scalable, we only fall back on a fixed VF when the TC is less than or
4789 // equal to the known number of lanes.
4790 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4791 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4792 "exceeding the constant trip count: "
4793 << ClampedUpperTripCount << "\n");
4794 return ElementCount::get(
4795 ClampedUpperTripCount,
4796 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4797 }
4798
4799 TargetTransformInfo::RegisterKind RegKind =
4800 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4801 : TargetTransformInfo::RGK_FixedWidthVector;
4802 ElementCount MaxVF = MaxVectorElementCount;
4803 if (MaximizeBandwidth ||
4804 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4807 auto MaxVectorElementCountMaxBW = ElementCount::get(
4808 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4809 ComputeScalableMaxVF);
4810 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4811
4812 // Collect all viable vectorization factors larger than the default MaxVF
4813 // (i.e. MaxVectorElementCount).
4815 for (ElementCount VS = MaxVectorElementCount * 2;
4816 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4817 VFs.push_back(VS);
4818
4819 // For each VF calculate its register usage.
4820 auto RUs = calculateRegisterUsage(VFs);
4821
4822 // Select the largest VF which doesn't require more registers than existing
4823 // ones.
4824 for (int i = RUs.size() - 1; i >= 0; --i) {
4825 bool Selected = true;
4826 for (auto &pair : RUs[i].MaxLocalUsers) {
4827 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4828 if (pair.second > TargetNumRegisters)
4829 Selected = false;
4830 }
4831 if (Selected) {
4832 MaxVF = VFs[i];
4833 break;
4834 }
4835 }
4836 if (ElementCount MinVF =
4837 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4838 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4839 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4840 << ") with target's minimum: " << MinVF << '\n');
4841 MaxVF = MinVF;
4842 }
4843 }
4844
4845 // Invalidate any widening decisions we might have made, in case the loop
4846 // requires prediction (decided later), but we have already made some
4847 // load/store widening decisions.
4849 }
4850 return MaxVF;
4851}
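// Illustrative sketch (standalone, not part of the vectorizer): ignoring
// scalable vectors and the register-usage based widening above, the core of
// getMaximizedVFForTarget is "largest power-of-two number of WidestType
// elements that fits in the widest register, clamped to the power-of-two
// floor of a known small trip count".  The helper name and the plain unsigned
// arithmetic below are made up for illustration.
static unsigned sketchMaximizedFixedVF(unsigned WidestRegisterBits,
                                       unsigned WidestTypeBits,
                                       unsigned MaxTripCount) {
  if (WidestTypeBits == 0 || WidestRegisterBits < WidestTypeBits)
    return 1; // no usable vector registers for this element type
  unsigned MaxElts = WidestRegisterBits / WidestTypeBits; // e.g. 256 / 32 = 8
  unsigned VF = 1;
  while (VF * 2 <= MaxElts) // power-of-two floor, like llvm::bit_floor
    VF *= 2;
  if (MaxTripCount && MaxTripCount < VF) {
    unsigned Clamped = 1; // don't pick a VF larger than the known trip count
    while (Clamped * 2 <= MaxTripCount)
      Clamped *= 2;
    VF = Clamped;
  }
  return VF;
}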
4852
4853/// Convenience function that returns the value of vscale_range if
4854/// vscale_range.min == vscale_range.max, and otherwise returns the value
4855/// returned by the corresponding TTI method.
4856static std::optional<unsigned>
4858 const Function *Fn = L->getHeader()->getParent();
4859 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4860 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4861 auto Min = Attr.getVScaleRangeMin();
4862 auto Max = Attr.getVScaleRangeMax();
4863 if (Max && Min == Max)
4864 return Max;
4865 }
4866
4867 return TTI.getVScaleForTuning();
4868}
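// Illustrative sketch (standalone, not part of the vectorizer): the helper
// above only trusts vscale_range when it pins vscale to a single value.  A
// made-up plain-integer version, where 0 stands for "no attribute value" and
// TTITuningVScale stands in for the target's tuning result:
static unsigned sketchVScaleForTuning(unsigned AttrMin, unsigned AttrMax,
                                      unsigned TTITuningVScale) {
  // vscale_range(4, 4) fixes vscale to 4; something like vscale_range(1, 16)
  // is not precise enough, so fall back to the target's tuning value.
  if (AttrMax != 0 && AttrMin == AttrMax)
    return AttrMax;
  return TTITuningVScale;
}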
4869
4870bool LoopVectorizationPlanner::isMoreProfitable(
4871 const VectorizationFactor &A, const VectorizationFactor &B) const {
4872 InstructionCost CostA = A.Cost;
4873 InstructionCost CostB = B.Cost;
4874
4875 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4876
4877 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4878 // If the trip count is a known (possibly small) constant, the trip count
4879 // will be rounded up to an integer number of iterations under
4880 // FoldTailByMasking. The total cost in that case will be
4881 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4882 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4883 // some extra overheads, but for the purpose of comparing the costs of
4884 // different VFs we can use this to compare the total loop-body cost
4885 // expected after vectorization.
4886 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4887 InstructionCost VectorCost,
4888 InstructionCost ScalarCost) {
4889 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4890 : VectorCost * (MaxTripCount / VF) +
4891 ScalarCost * (MaxTripCount % VF);
4892 };
4893 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4894 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4895
4896 return RTCostA < RTCostB;
4897 }
4898
4899 // Improve estimate for the vector width if it is scalable.
4900 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4901 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4902 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4903 if (A.Width.isScalable())
4904 EstimatedWidthA *= *VScale;
4905 if (B.Width.isScalable())
4906 EstimatedWidthB *= *VScale;
4907 }
4908
4909 // Assume vscale may be larger than 1 (or the value being tuned for),
4910 // so that scalable vectorization is slightly favorable over fixed-width
4911 // vectorization.
4912 if (A.Width.isScalable() && !B.Width.isScalable())
4913 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4914
4915 // To avoid the need for FP division:
4916 // (CostA / A.Width) < (CostB / B.Width)
4917 // <=> (CostA * B.Width) < (CostB * A.Width)
4918 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4919}
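// Illustrative sketch (standalone, not part of the vectorizer): for fixed VFs
// with a known trip count, the comparison above uses a whole-loop cost rather
// than a per-lane one.  The helper below restates GetCostForTC with made-up
// plain unsigned costs.
static unsigned sketchLoopBodyCost(unsigned TripCount, unsigned VF,
                                   unsigned VectorCost, unsigned ScalarCost,
                                   bool FoldTailByMasking) {
  if (FoldTailByMasking)
    // Tail folding runs ceil(TC / VF) vector iterations and no scalar tail.
    return VectorCost * ((TripCount + VF - 1) / VF);
  // Otherwise: floor(TC / VF) vector iterations plus a scalar tail of TC % VF.
  return VectorCost * (TripCount / VF) + ScalarCost * (TripCount % VF);
}
// For example, with TC = 10, VF = 4, VectorCost = 6 and ScalarCost = 2, tail
// folding costs 6 * 3 = 18 while a scalar epilogue costs 6 * 2 + 2 * 2 = 16.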
4920
4923 Loop *TheLoop) {
4924 if (InvalidCosts.empty())
4925 return;
4926
4927 // Emit a report of VFs with invalid costs in the loop.
4928
4929 // Group the remarks per instruction, keeping the instruction order from
4930 // InvalidCosts.
4931 std::map<Instruction *, unsigned> Numbering;
4932 unsigned I = 0;
4933 for (auto &Pair : InvalidCosts)
4934 if (!Numbering.count(Pair.first))
4935 Numbering[Pair.first] = I++;
4936
4937 // Sort the list, first on instruction(number) then on VF.
4938 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4939 if (Numbering[A.first] != Numbering[B.first])
4940 return Numbering[A.first] < Numbering[B.first];
4942 return ECC(A.second, B.second);
4943 });
4944
4945 // For a list of ordered instruction-vf pairs:
4946 // [(load, vf1), (load, vf2), (store, vf1)]
4947 // Group the instructions together to emit separate remarks for:
4948 // load (vf1, vf2)
4949 // store (vf1)
4950 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4951 auto Subset = ArrayRef<InstructionVFPair>();
4952 do {
4953 if (Subset.empty())
4954 Subset = Tail.take_front(1);
4955
4956 Instruction *I = Subset.front().first;
4957
4958 // If the next instruction is different, or if there are no other pairs,
4959 // emit a remark for the collated subset. e.g.
4960 // [(load, vf1), (load, vf2)]
4961 // to emit:
4962 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4963 if (Subset == Tail || Tail[Subset.size()].first != I) {
4964 std::string OutString;
4965 raw_string_ostream OS(OutString);
4966 assert(!Subset.empty() && "Unexpected empty range");
4967 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4968 for (const auto &Pair : Subset)
4969 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4970 OS << "):";
4971 if (auto *CI = dyn_cast<CallInst>(I))
4972 OS << " call to " << CI->getCalledFunction()->getName();
4973 else
4974 OS << " " << I->getOpcodeName();
4975 OS.flush();
4976 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4977 Tail = Tail.drop_front(Subset.size());
4978 Subset = {};
4979 } else
4980 // Grow the subset by one element
4981 Subset = Tail.take_front(Subset.size() + 1);
4982 } while (!Tail.empty());
4983}
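// Illustrative sketch (standalone, not part of the vectorizer): the
// Subset/Tail walk above is a run-length grouping over a list that is already
// sorted by instruction.  A made-up standalone version over plain key/VF
// pairs, assuming the standard <string>, <utility> and <vector> headers are
// available:
static unsigned sketchCountRemarkGroups(
    const std::vector<std::pair<std::string, unsigned>> &SortedPairs) {
  unsigned Groups = 0;
  unsigned Start = 0;
  while (Start < SortedPairs.size()) {
    unsigned End = Start + 1;
    // Extend the run while the "instruction" key stays the same; each run
    // becomes one remark listing all of its VFs.
    while (End < SortedPairs.size() &&
           SortedPairs[End].first == SortedPairs[Start].first)
      ++End;
    ++Groups;
    Start = End;
  }
  // e.g. {("load", 2), ("load", 4), ("store", 2)} -> 2 remark groups.
  return Groups;
}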
4984
4985VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
4986 const ElementCountSet &VFCandidates) {
4987 InstructionCost ExpectedCost =
4989 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4990 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4991 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
4992 "Expected Scalar VF to be a candidate");
4993
4994 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4995 ExpectedCost);
4996 VectorizationFactor ChosenFactor = ScalarCost;
4997
4998 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4999 if (ForceVectorization && VFCandidates.size() > 1) {
5000 // Ignore scalar width, because the user explicitly wants vectorization.
5001 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5002 // evaluation.
5003 ChosenFactor.Cost = InstructionCost::getMax();
5004 }
5005
5006 SmallVector<InstructionVFPair> InvalidCosts;
5007 for (const auto &i : VFCandidates) {
5008 // The cost for scalar VF=1 is already calculated, so ignore it.
5009 if (i.isScalar())
5010 continue;
5011
5013 CM.expectedCost(i, &InvalidCosts);
5014 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5015
5016#ifndef NDEBUG
5017 unsigned AssumedMinimumVscale =
5018 getVScaleForTuning(OrigLoop, TTI).value_or(1);
5019 unsigned Width =
5020 Candidate.Width.isScalable()
5021 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5022 : Candidate.Width.getFixedValue();
5023 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5024 << " costs: " << (Candidate.Cost / Width));
5025 if (i.isScalable())
5026 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5027 << AssumedMinimumVscale << ")");
5028 LLVM_DEBUG(dbgs() << ".\n");
5029#endif
5030
5031 if (!C.second && !ForceVectorization) {
5032 LLVM_DEBUG(
5033 dbgs() << "LV: Not considering vector loop of width " << i
5034 << " because it will not generate any vector instructions.\n");
5035 continue;
5036 }
5037
5038 // If profitable, add it to the ProfitableVFs list.
5039 if (isMoreProfitable(Candidate, ScalarCost))
5040 ProfitableVFs.push_back(Candidate);
5041
5042 if (isMoreProfitable(Candidate, ChosenFactor))
5043 ChosenFactor = Candidate;
5044 }
5045
5046 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5047
5050 "There are conditional stores.",
5051 "store that is conditionally executed prevents vectorization",
5052 "ConditionalStore", ORE, OrigLoop);
5053 ChosenFactor = ScalarCost;
5054 }
5055
5056 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5057 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5058 << "LV: Vectorization seems to be not beneficial, "
5059 << "but was forced by a user.\n");
5060 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5061 return ChosenFactor;
5062}
5063
5064bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5065 ElementCount VF) const {
5066 // Cross iteration phis such as reductions need special handling and are
5067 // currently unsupported.
5068 if (any_of(OrigLoop->getHeader()->phis(),
5069 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5070 return false;
5071
5072 // Phis with uses outside of the loop require special handling and are
5073 // currently unsupported.
5074 for (const auto &Entry : Legal->getInductionVars()) {
5075 // Look for uses of the value of the induction at the last iteration.
5076 Value *PostInc =
5077 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5078 for (User *U : PostInc->users())
5079 if (!OrigLoop->contains(cast<Instruction>(U)))
5080 return false;
5081 // Look for uses of the penultimate value of the induction.
5082 for (User *U : Entry.first->users())
5083 if (!OrigLoop->contains(cast<Instruction>(U)))
5084 return false;
5085 }
5086
5087 // Epilogue vectorization code has not been audited to ensure it handles
5088 // non-latch exits properly. It may be fine, but it needs to be audited and
5089 // tested.
5090 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5091 return false;
5092
5093 return true;
5094}
5095
5097 const ElementCount VF) const {
5098 // FIXME: We need a much better cost-model to take different parameters such
5099 // as register pressure, code size increase and cost of extra branches into
5100 // account. For now we apply a very crude heuristic and only consider loops
5101 // with vectorization factors larger than a certain value.
5102
5103 // Allow the target to opt out entirely.
5105 return false;
5106
5107 // We also consider epilogue vectorization unprofitable for targets that don't
5108 // consider interleaving beneficial (e.g. MVE).
5109 if (TTI.getMaxInterleaveFactor(VF) <= 1)
5110 return false;
5111
5112 unsigned Multiplier = 1;
5113 if (VF.isScalable())
5114 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5115 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5116 return true;
5117 return false;
5118}
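// Illustrative sketch (standalone, not part of the vectorizer): the crude
// heuristic above treats a scalable VF of <vscale x N> as roughly
// N * vscale lanes and only considers epilogue vectorization once that
// estimate reaches a threshold.  The helper and its default of 16 are made up
// for illustration (the real threshold is the EpilogueVectorizationMinVF
// option used above).
static bool sketchEpilogueProfitable(unsigned KnownMinLanes, bool Scalable,
                                     unsigned AssumedVScale,
                                     unsigned MinProfitableLanes = 16) {
  unsigned EstimatedLanes =
      Scalable ? KnownMinLanes * AssumedVScale : KnownMinLanes;
  // e.g. <vscale x 4> with an assumed vscale of 4 counts as 16 lanes.
  return EstimatedLanes >= MinProfitableLanes;
}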
5119
5121 const ElementCount MainLoopVF, unsigned IC) {
5124 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5125 return Result;
5126 }
5127
5128 if (!CM.isScalarEpilogueAllowed()) {
5129 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5130 "epilogue is allowed.\n");
5131 return Result;
5132 }
5133
5134 // Not really a cost consideration, but check for unsupported cases here to
5135 // simplify the logic.
5136 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5137 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5138 "is not a supported candidate.\n");
5139 return Result;
5140 }
5141
5143 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5145 if (hasPlanWithVF(ForcedEC))
5146 return {ForcedEC, 0, 0};
5147 else {
5148 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5149 "viable.\n");
5150 return Result;
5151 }
5152 }
5153
5154 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5155 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5156 LLVM_DEBUG(
5157 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5158 return Result;
5159 }
5160
5161 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5162 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5163 "this loop\n");
5164 return Result;
5165 }
5166
5167 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5168 // the main loop handles 8 lanes per iteration. We could still benefit from
5169 // vectorizing the epilogue loop with VF=4.
5170 ElementCount EstimatedRuntimeVF = MainLoopVF;
5171 if (MainLoopVF.isScalable()) {
5172 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5173 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5174 EstimatedRuntimeVF *= *VScale;
5175 }
5176
5177 ScalarEvolution &SE = *PSE.getSE();
5178 Type *TCType = Legal->getWidestInductionType();
5179 const SCEV *RemainingIterations = nullptr;
5180 for (auto &NextVF : ProfitableVFs) {
5181 // Skip candidate VFs without a corresponding VPlan.
5182 if (!hasPlanWithVF(NextVF.Width))
5183 continue;
5184
5185 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5186 // vectors) or the VF of the main loop (fixed vectors).
5187 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5188 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5189 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5190 continue;
5191
5192 // If NextVF is greater than the number of remaining iterations, the
5193 // epilogue loop would be dead. Skip such factors.
5194 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5195 // TODO: extend to support scalable VFs.
5196 if (!RemainingIterations) {
5197 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5198 RemainingIterations = SE.getURemExpr(
5199 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5200 }
5201 if (SE.isKnownPredicate(
5203 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5204 RemainingIterations))
5205 continue;
5206 }
5207
5208 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5209 Result = NextVF;
5210 }
5211
5212 if (Result != VectorizationFactor::Disabled())
5213 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5214 << Result.Width << "\n");
5215 return Result;
5216}
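// Illustrative sketch (standalone, not part of the vectorizer): for
// fixed-width factors, the "would the epilogue loop be dead?" test above
// reduces to comparing the candidate epilogue VF against TC % (MainLoopVF *
// IC).  The helper below is made up and assumes a known trip count and
// non-zero VF and IC.
static bool sketchEpilogueVFIsViable(unsigned TripCount, unsigned MainLoopVF,
                                     unsigned IC, unsigned EpilogueVF) {
  // Iterations left over once the main vector loop has done its work.
  unsigned Remaining = TripCount % (MainLoopVF * IC);
  // e.g. TC = 100, MainLoopVF = 8, IC = 2 -> Remaining = 4, so EpilogueVF = 4
  // is viable but EpilogueVF = 8 would never execute.
  return EpilogueVF <= Remaining;
}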
5217
5218std::pair<unsigned, unsigned>
5220 unsigned MinWidth = -1U;
5221 unsigned MaxWidth = 8;
5223 // For in-loop reductions, no element types are added to ElementTypesInLoop
5224 // if there are no loads/stores in the loop. In this case, check through the
5225 // reduction variables to determine the maximum width.
5226 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5227 // Reset MaxWidth so that we can find the smallest type used by recurrences
5228 // in the loop.
5229 MaxWidth = -1U;
5230 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5231 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5232 // When finding the min width used by the recurrence we need to account
5233 // for casts on the input operands of the recurrence.
5234 MaxWidth = std::min<unsigned>(
5235 MaxWidth, std::min<unsigned>(
5238 }
5239 } else {
5240 for (Type *T : ElementTypesInLoop) {
5241 MinWidth = std::min<unsigned>(
5242 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5243 MaxWidth = std::max<unsigned>(
5244 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5245 }
5246 }
5247 return {MinWidth, MaxWidth};
5248}
5249
5251 ElementTypesInLoop.clear();
5252 // For each block.
5253 for (BasicBlock *BB : TheLoop->blocks()) {
5254 // For each instruction in the loop.
5255 for (Instruction &I : BB->instructionsWithoutDebug()) {
5256 Type *T = I.getType();
5257
5258 // Skip ignored values.
5259 if (ValuesToIgnore.count(&I))
5260 continue;
5261
5262 // Only examine Loads, Stores and PHINodes.
5263 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5264 continue;
5265
5266 // Examine PHI nodes that are reduction variables. Update the type to
5267 // account for the recurrence type.
5268 if (auto *PN = dyn_cast<PHINode>(&I)) {
5269 if (!Legal->isReductionVariable(PN))
5270 continue;
5271 const RecurrenceDescriptor &RdxDesc =
5272 Legal->getReductionVars().find(PN)->second;
5275 RdxDesc.getRecurrenceType(),
5277 continue;
5278 T = RdxDesc.getRecurrenceType();
5279 }
5280
5281 // Examine the stored values.
5282 if (auto *ST = dyn_cast<StoreInst>(&I))
5283 T = ST->getValueOperand()->getType();
5284
5285 assert(T->isSized() &&
5286 "Expected the load/store/recurrence type to be sized");
5287
5288 ElementTypesInLoop.insert(T);
5289 }
5290 }
5291}
5292
5293unsigned
5295 InstructionCost LoopCost) {
5296 // -- The interleave heuristics --
5297 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5298 // There are many micro-architectural considerations that we can't predict
5299 // at this level. For example, frontend pressure (on decode or fetch) due to
5300 // code size, or the number and capabilities of the execution ports.
5301 //
5302 // We use the following heuristics to select the interleave count:
5303 // 1. If the code has reductions, then we interleave to break the cross
5304 // iteration dependency.
5305 // 2. If the loop is really small, then we interleave to reduce the loop
5306 // overhead.
5307 // 3. We don't interleave if we think that we will spill registers to memory
5308 // due to the increased register pressure.
5309
5311 return 1;
5312
5313 // Do not interleave if EVL is preferred and no User IC is specified.
5314 if (foldTailWithEVL()) {
5315 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
5316 "Unroll factor forced to be 1.\n");
5317 return 1;
5318 }
5319
5320 // We used the distance for the interleave count.
5322 return 1;
5323
5324 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5325 const bool HasReductions = !Legal->getReductionVars().empty();
5326
5327 // If we did not calculate the cost for VF (because the user selected the VF)
5328 // then we calculate the cost of VF here.
5329 if (LoopCost == 0) {
5330 LoopCost = expectedCost(VF).first;
5331 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5332
5333 // Loop body is free and there is no need for interleaving.
5334 if (LoopCost == 0)
5335 return 1;
5336 }
5337
5339 // We divide by these constants so assume that we have at least one
5340 // instruction that uses at least one register.
5341 for (auto& pair : R.MaxLocalUsers) {
5342 pair.second = std::max(pair.second, 1U);
5343 }
5344
5345 // We calculate the interleave count using the following formula.
5346 // Subtract the number of loop invariants from the number of available
5347 // registers. These registers are used by all of the interleaved instances.
5348 // Next, divide the remaining registers by the number of registers that is
5349 // required by the loop, in order to estimate how many parallel instances
5350 // fit without causing spills. All of this is rounded down if necessary to be
5351 // a power of two. We want a power-of-two interleave count to simplify any
5352 // addressing operations or alignment considerations. We also want
5353 // power-of-two interleave counts to ensure that the induction variable of
5354 // the vector loop wraps to zero when the tail is folded by masking; this
5355 // currently happens under OptForSize, in which case IC is set to 1 above.
5356 unsigned IC = UINT_MAX;
5357
5358 for (auto& pair : R.MaxLocalUsers) {
5359 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5360 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5361 << " registers of "
5362 << TTI.getRegisterClassName(pair.first) << " register class\n");
5363 if (VF.isScalar()) {
5364 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5365 TargetNumRegisters = ForceTargetNumScalarRegs;
5366 } else {
5367 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5368 TargetNumRegisters = ForceTargetNumVectorRegs;
5369 }
5370 unsigned MaxLocalUsers = pair.second;
5371 unsigned LoopInvariantRegs = 0;
5372 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5373 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5374
5375 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5376 MaxLocalUsers);
5377 // Don't count the induction variable as interleaved.
5379 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5380 std::max(1U, (MaxLocalUsers - 1)));
5381 }
5382
5383 IC = std::min(IC, TmpIC);
5384 }
5385
5386 // Clamp the interleave ranges to reasonable counts.
5387 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5388
5389 // Check if the user has overridden the max.
5390 if (VF.isScalar()) {
5391 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5392 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5393 } else {
5394 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5395 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5396 }
5397
5398 unsigned EstimatedVF = VF.getKnownMinValue();
5399 if (VF.isScalable()) {
5400 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5401 EstimatedVF *= *VScale;
5402 }
5403 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5404
5405 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5406 if (KnownTC > 0) {
5407 // At least one iteration must be scalar when this constraint holds. So the
5408 // maximum available iterations for interleaving is one less.
5409 unsigned AvailableTC =
5410 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5411
5412 // If trip count is known we select between two prospective ICs, where
5413 // 1) the aggressive IC is capped by the trip count divided by VF
5414 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5415 // The final IC is selected in a way that the epilogue loop trip count is
5416 // minimized while maximizing the IC itself, so that we either run the
5417 // vector loop at least once if it generates a small epilogue loop, or else
5418 // we run the vector loop at least twice.
5419
5420 unsigned InterleaveCountUB = bit_floor(
5421 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5422 unsigned InterleaveCountLB = bit_floor(std::max(
5423 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5424 MaxInterleaveCount = InterleaveCountLB;
5425
5426 if (InterleaveCountUB != InterleaveCountLB) {
5427 unsigned TailTripCountUB =
5428 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5429 unsigned TailTripCountLB =
5430 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5431 // If both produce the same scalar tail, maximize the IC to do the same work
5432 // in fewer vector loop iterations.
5433 if (TailTripCountUB == TailTripCountLB)
5434 MaxInterleaveCount = InterleaveCountUB;
5435 }
5436 } else if (BestKnownTC && *BestKnownTC > 0) {
5437 // At least one iteration must be scalar when this constraint holds. So the
5438 // maximum available iterations for interleaving is one less.
5439 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5440 ? (*BestKnownTC) - 1
5441 : *BestKnownTC;
5442
5443 // If the trip count is only an estimated compile-time constant, cap the IC
5444 // at the trip count divided by (VF * 2), so that the vector loop runs at
5445 // least twice to make interleaving seem profitable when there is an
5446 // epilogue loop present. Since the exact trip count is not known, we choose
5447 // to be conservative in our IC estimate.
5448 MaxInterleaveCount = bit_floor(std::max(
5449 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5450 }
5451
5452 assert(MaxInterleaveCount > 0 &&
5453 "Maximum interleave count must be greater than 0");
5454
5455 // Clamp the calculated IC to be between the 1 and the max interleave count
5456 // that the target and trip count allows.
5457 if (IC > MaxInterleaveCount)
5458 IC = MaxInterleaveCount;
5459 else
5460 // Make sure IC is greater than 0.
5461 IC = std::max(1u, IC);
5462
5463 assert(IC > 0 && "Interleave count must be greater than 0.");
5464
5465 // Interleave if we vectorized this loop and there is a reduction that could
5466 // benefit from interleaving.
5467 if (VF.isVector() && HasReductions) {
5468 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5469 return IC;
5470 }
5471
5472 // For any scalar loop that either requires runtime checks or predication we
5473 // are better off leaving this to the unroller. Note that if we've already
5474 // vectorized the loop we will have done the runtime check and so interleaving
5475 // won't require further checks.
5476 bool ScalarInterleavingRequiresPredication =
5477 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5478 return Legal->blockNeedsPredication(BB);
5479 }));
5480 bool ScalarInterleavingRequiresRuntimePointerCheck =
5482
5483 // We want to interleave small loops in order to reduce the loop overhead and
5484 // potentially expose ILP opportunities.
5485 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5486 << "LV: IC is " << IC << '\n'
5487 << "LV: VF is " << VF << '\n');
5488 const bool AggressivelyInterleaveReductions =
5489 TTI.enableAggressiveInterleaving(HasReductions);
5490 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5491 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5492 // We assume that the cost overhead is 1 and we use the cost model
5493 // to estimate the cost of the loop and interleave until the cost of the
5494 // loop overhead is about 5% of the cost of the loop.
5495 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5496 SmallLoopCost / *LoopCost.getValue()));
5497
5498 // Interleave until store/load ports (estimated by max interleave count) are
5499 // saturated.
5500 unsigned NumStores = Legal->getNumStores();
5501 unsigned NumLoads = Legal->getNumLoads();
5502 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5503 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5504
5505 // There is little point in interleaving for reductions containing selects
5506 // and compares when VF=1 since it may just create more overhead than it's
5507 // worth for loops with small trip counts. This is because we still have to
5508 // do the final reduction after the loop.
5509 bool HasSelectCmpReductions =
5510 HasReductions &&
5511 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5512 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5513 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5514 RdxDesc.getRecurrenceKind());
5515 });
5516 if (HasSelectCmpReductions) {
5517 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5518 return 1;
5519 }
5520
5521 // If we have a scalar reduction (vector reductions are already dealt with
5522 // by this point), we can increase the critical path length if the loop
5523 // we're interleaving is inside another loop. For tree-wise reductions
5524 // set the limit to 2, and for ordered reductions it's best to disable
5525 // interleaving entirely.
5526 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5527 bool HasOrderedReductions =
5528 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5529 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5530 return RdxDesc.isOrdered();
5531 });
5532 if (HasOrderedReductions) {
5533 LLVM_DEBUG(
5534 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5535 return 1;
5536 }
5537
5538 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5539 SmallIC = std::min(SmallIC, F);
5540 StoresIC = std::min(StoresIC, F);
5541 LoadsIC = std::min(LoadsIC, F);
5542 }
5543
5545 std::max(StoresIC, LoadsIC) > SmallIC) {
5546 LLVM_DEBUG(
5547 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5548 return std::max(StoresIC, LoadsIC);
5549 }
5550
5551 // If there are scalar reductions and TTI has enabled aggressive
5552 // interleaving for reductions, we will interleave to expose ILP.
5553 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5554 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5555 // Interleave no less than SmallIC but not as aggressive as the normal IC
5556 // to satisfy the rare situation when resources are too limited.
5557 return std::max(IC / 2, SmallIC);
5558 } else {
5559 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5560 return SmallIC;
5561 }
5562 }
5563
5564 // Interleave if this is a large loop (small loops are already dealt with by
5565 // this point) that could benefit from interleaving.
5566 if (AggressivelyInterleaveReductions) {
5567 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5568 return IC;
5569 }
5570
5571 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5572 return 1;
5573}
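// Illustrative sketch (standalone, not part of the vectorizer): the
// register-pressure part of the heuristic above picks the largest
// power-of-two number of interleaved copies whose combined register demand
// still fits the register file.  The helper below is a made-up,
// single-register-class restatement in plain unsigned arithmetic.
static unsigned sketchRegisterLimitedIC(unsigned TargetNumRegisters,
                                        unsigned LoopInvariantRegs,
                                        unsigned MaxLocalUsers,
                                        unsigned MaxInterleaveCount) {
  if (MaxLocalUsers == 0)
    MaxLocalUsers = 1; // assume at least one register is used per copy
  unsigned Budget = TargetNumRegisters > LoopInvariantRegs
                        ? TargetNumRegisters - LoopInvariantRegs
                        : 1;
  unsigned IC = 1;
  // Power-of-two floor of Budget / MaxLocalUsers, e.g. 32 registers, 4 of
  // them loop-invariant, 6 local users -> floor(28 / 6) = 4 -> IC = 4.
  while (IC * 2 <= Budget / MaxLocalUsers)
    IC *= 2;
  // Never exceed what the target (and the trip-count logic above) allow.
  return IC < MaxInterleaveCount ? IC : MaxInterleaveCount;
}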
5574
5577 // This function calculates the register usage by measuring the highest number
5578 // of values that are alive at a single location. Obviously, this is a very
5579 // rough estimation. We scan the loop in topological order and assign a
5580 // number to each instruction. We use RPO to ensure that defs are
5581 // met before their users. We assume that each instruction that has in-loop
5582 // users starts an interval. We record every time that an in-loop value is
5583 // used, so we have a list of the first and last occurrences of each
5584 // instruction. Next, we transpose this data structure into a multi map that
5585 // holds the list of intervals that *end* at a specific location. This multi
5586 // map allows us to perform a linear search. We scan the instructions linearly
5587 // and record each time that a new interval starts, by placing it in a set.
5588 // If we find this value in the multi-map then we remove it from the set.
5589 // The max register usage is the maximum size of the set.
5590 // We also search for instructions that are defined outside the loop, but are
5591 // used inside the loop. We need this number separately from the max-interval
5592 // usage number because when we unroll, loop-invariant values do not take
5593 // more registers.
5595 DFS.perform(LI);
5596
5597 RegisterUsage RU;
5598
5599 // Each 'key' in the map opens a new interval. The values
5600 // of the map are the index of the 'last seen' usage of the
5601 // instruction that is the key.
5603
5604 // Maps instruction to its index.
5606 // Marks the end of each interval.
5607 IntervalMap EndPoint;
5608 // Saves the list of instruction indices that are used in the loop.
5610 // Saves the list of values that are used in the loop but are defined outside
5611 // the loop (not including non-instruction values such as arguments and
5612 // constants).
5613 SmallSetVector<Instruction *, 8> LoopInvariants;
5614
5615 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5616 for (Instruction &I : BB->instructionsWithoutDebug()) {
5617 IdxToInstr.push_back(&I);
5618
5619 // Save the end location of each USE.
5620 for (Value *U : I.operands()) {
5621 auto *Instr = dyn_cast<Instruction>(U);
5622
5623 // Ignore non-instruction values such as arguments, constants, etc.
5624 // FIXME: Might need some motivation why these values are ignored. If
5625 // for example an argument is used inside the loop it will increase the
5626 // register pressure (so shouldn't we add it to LoopInvariants).
5627 if (!Instr)
5628 continue;
5629
5630 // If this instruction is outside the loop then record it and continue.
5631 if (!TheLoop->contains(Instr)) {
5632 LoopInvariants.insert(Instr);
5633 continue;
5634 }
5635
5636 // Overwrite previous end points.
5637 EndPoint[Instr] = IdxToInstr.size();
5638 Ends.insert(Instr);
5639 }
5640 }
5641 }
5642
5643 // Saves the list of intervals that end with the index in 'key'.
5644 using InstrList = SmallVector<Instruction *, 2>;
5645 DenseMap<unsigned, InstrList> TransposeEnds;
5646
5647 // Transpose the EndPoints to a list of values that end at each index.
5648 for (auto &Interval : EndPoint)
5649 TransposeEnds[Interval.second].push_back(Interval.first);
5650
5651 SmallPtrSet<Instruction *, 8> OpenIntervals;
5654
5655 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5656
5657 const auto &TTICapture = TTI;
5658 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5659 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5660 return 0;
5661 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5662 };
5663
5664 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5665 Instruction *I = IdxToInstr[i];
5666
5667 // Remove all of the instructions that end at this location.
5668 InstrList &List = TransposeEnds[i];
5669 for (Instruction *ToRemove : List)
5670 OpenIntervals.erase(ToRemove);
5671
5672 // Ignore instructions that are never used within the loop.
5673 if (!Ends.count(I))
5674 continue;
5675
5676 // Skip ignored values.
5677 if (ValuesToIgnore.count(I))
5678 continue;
5679
5681
5682 // For each VF find the maximum usage of registers.
5683 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5684 // Count the number of registers used, per register class, given all open
5685 // intervals.
5686 // Note that elements in this SmallMapVector will be default constructed
5687 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5688 // there is no previous entry for ClassID.
5690
5691 if (VFs[j].isScalar()) {
5692 for (auto *Inst : OpenIntervals) {
5693 unsigned ClassID =
5694 TTI.getRegisterClassForType(false, Inst->getType());
5695 // FIXME: The target might use more than one register for the type
5696 // even in the scalar case.
5697 RegUsage[ClassID] += 1;
5698 }
5699 } else {
5701 for (auto *Inst : OpenIntervals) {
5702 // Skip ignored values for VF > 1.
5703 if (VecValuesToIgnore.count(Inst))
5704 continue;
5705 if (isScalarAfterVectorization(Inst, VFs[j])) {
5706 unsigned ClassID =
5707 TTI.getRegisterClassForType(false, Inst->getType());
5708 // FIXME: The target might use more than one register for the type
5709 // even in the scalar case.
5710 RegUsage[ClassID] += 1;
5711 } else {
5712 unsigned ClassID =
5713 TTI.getRegisterClassForType(true, Inst->getType());
5714 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5715 }
5716 }
5717 }
5718
5719 for (auto& pair : RegUsage) {
5720 auto &Entry = MaxUsages[j][pair.first];
5721 Entry = std::max(Entry, pair.second);
5722 }
5723 }
5724
5725 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5726 << OpenIntervals.size() << '\n');
5727
5728 // Add the current instruction to the list of open intervals.
5729 OpenIntervals.insert(I);
5730 }
5731
5732 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5733 // Note that elements in this SmallMapVector will be default constructed
5734 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5735 // there is no previous entry for ClassID.
5737
5738 for (auto *Inst : LoopInvariants) {
5739 // FIXME: The target might use more than one register for the type
5740 // even in the scalar case.
5741 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5742 auto *I = cast<Instruction>(U);
5743 return TheLoop != LI->getLoopFor(I->getParent()) ||
5744 isScalarAfterVectorization(I, VFs[i]);
5745 });
5746
5747 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5748 unsigned ClassID =
5749 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5750 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5751 }
5752
5753 LLVM_DEBUG({
5754 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5755 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5756 << " item\n";
5757 for (const auto &pair : MaxUsages[i]) {
5758 dbgs() << "LV(REG): RegisterClass: "
5759 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5760 << " registers\n";
5761 }
5762 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5763 << " item\n";
5764 for (const auto &pair : Invariant) {
5765 dbgs() << "LV(REG): RegisterClass: "
5766 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5767 << " registers\n";
5768 }
5769 });
5770
5771 RU.LoopInvariantRegs = Invariant;
5772 RU.MaxLocalUsers = MaxUsages[i];
5773 RUs[i] = RU;
5774 }
5775
5776 return RUs;
5777}
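// Illustrative sketch (standalone, not part of the vectorizer): stripped of
// register classes and per-VF type scaling, the scan above is a "maximum
// number of simultaneously live intervals" sweep.  A made-up version over
// precomputed (Start, End) instruction indices, assuming the standard
// <utility> and <vector> headers are available:
static unsigned sketchMaxLiveValues(
    const std::vector<std::pair<unsigned, unsigned>> &Intervals,
    unsigned NumInstructions) {
  unsigned MaxLive = 0;
  for (unsigned Idx = 0; Idx < NumInstructions; ++Idx) {
    unsigned Live = 0;
    // An interval [Start, End) is live at Idx if it has started but not ended.
    for (const auto &IV : Intervals)
      if (IV.first <= Idx && Idx < IV.second)
        ++Live;
    MaxLive = Live > MaxLive ? Live : MaxLive;
  }
  return MaxLive; // a rough stand-in for RegisterUsage::MaxLocalUsers
}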
5778
5779bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5780 ElementCount VF) {
5781 // TODO: Cost model for emulated masked load/store is completely
5782 // broken. This hack guides the cost model to use an artificially
5783 // high enough value to practically disable vectorization with such
5784 // operations, except where previously deployed legality hack allowed
5785 // using very low cost values. This is to avoid regressions coming simply
5786 // from moving "masked load/store" check from legality to cost model.
5787 // Masked Load/Gather emulation was previously never allowed.
5788 // Limited number of Masked Store/Scatter emulation was allowed.
5790 "Expecting a scalar emulated instruction");
5791 return isa<LoadInst>(I) ||
5792 (isa<StoreInst>(I) &&
5793 NumPredStores > NumberOfStoresToPredicate);
5794}
5795
5797 // If we aren't vectorizing the loop, or if we've already collected the
5798 // instructions to scalarize, there's nothing to do. Collection may already
5799 // have occurred if we have a user-selected VF and are now computing the
5800 // expected cost for interleaving.
5801 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5802 return;
5803
5804 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5805 // not profitable to scalarize any instructions, the presence of VF in the
5806 // map will indicate that we've analyzed it already.
5807 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5808
5809 PredicatedBBsAfterVectorization[VF].clear();
5810
5811 // Find all the instructions that are scalar with predication in the loop and
5812 // determine if it would be better to not if-convert the blocks they are in.
5813 // If so, we also record the instructions to scalarize.
5814 for (BasicBlock *BB : TheLoop->blocks()) {
5816 continue;
5817 for (Instruction &I : *BB)
5818 if (isScalarWithPredication(&I, VF)) {
5819 ScalarCostsTy ScalarCosts;
5820 // Do not apply discount if scalable, because that would lead to
5821 // invalid scalarization costs.
5822 // Do not apply discount logic if hacked cost is needed
5823 // for emulated masked memrefs.
5824 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5825 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5826 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5827 // Remember that BB will remain after vectorization.
5828 PredicatedBBsAfterVectorization[VF].insert(BB);
5829 }
5830 }
5831}
5832
5833InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5834 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5835 assert(!isUniformAfterVectorization(PredInst, VF) &&
5836 "Instruction marked uniform-after-vectorization will be predicated");
5837
5838 // Initialize the discount to zero, meaning that the scalar version and the
5839 // vector version cost the same.
5840 InstructionCost Discount = 0;
5841
5842 // Holds instructions to analyze. The instructions we visit are mapped in
5843 // ScalarCosts. Those instructions are the ones that would be scalarized if
5844 // we find that the scalar version costs less.
5846
5847 // Returns true if the given instruction can be scalarized.
5848 auto canBeScalarized = [&](Instruction *I) -> bool {
5849 // We only attempt to scalarize instructions forming a single-use chain
5850 // from the original predicated block that would otherwise be vectorized.
5851 // Although not strictly necessary, we give up on instructions we know will
5852 // already be scalar to avoid traversing chains that are unlikely to be
5853 // beneficial.
5854 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5856 return false;
5857
5858 // If the instruction is scalar with predication, it will be analyzed
5859 // separately. We ignore it within the context of PredInst.
5860 if (isScalarWithPredication(I, VF))
5861 return false;
5862
5863 // If any of the instruction's operands are uniform after vectorization,
5864 // the instruction cannot be scalarized. This prevents, for example, a
5865 // masked load from being scalarized.
5866 //
5867 // We assume we will only emit a value for lane zero of an instruction
5868 // marked uniform after vectorization, rather than VF identical values.
5869 // Thus, if we scalarize an instruction that uses a uniform, we would
5870 // create uses of values corresponding to the lanes we aren't emitting code
5871 // for. This behavior can be changed by allowing getScalarValue to clone
5872 // the lane zero values for uniforms rather than asserting.
5873 for (Use &U : I->operands())
5874 if (auto *J = dyn_cast<Instruction>(U.get()))
5875 if (isUniformAfterVectorization(J, VF))
5876 return false;
5877
5878 // Otherwise, we can scalarize the instruction.
5879 return true;
5880 };
5881
5882 // Compute the expected cost discount from scalarizing the entire expression
5883 // feeding the predicated instruction. We currently only consider expressions
5884 // that are single-use instruction chains.
5885 Worklist.push_back(PredInst);
5886 while (!Worklist.empty()) {
5887 Instruction *I = Worklist.pop_back_val();
5888
5889 // If we've already analyzed the instruction, there's nothing to do.
5890 if (ScalarCosts.contains(I))
5891 continue;
5892
5893 // Compute the cost of the vector instruction. Note that this cost already
5894 // includes the scalarization overhead of the predicated instruction.
5895 InstructionCost VectorCost = getInstructionCost(I, VF).first;
5896
5897 // Compute the cost of the scalarized instruction. This cost is the cost of
5898 // the instruction as if it wasn't if-converted and instead remained in the
5899 // predicated block. We will scale this cost by block probability after
5900 // computing the scalarization overhead.
5901 InstructionCost ScalarCost =
5902 VF.getFixedValue() *
5903 getInstructionCost(I, ElementCount::getFixed(1)).first;
5904
5905 // Compute the scalarization overhead of needed insertelement instructions
5906 // and phi nodes.
5908 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5909 ScalarCost += TTI.getScalarizationOverhead(
5910 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5911 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5912 /*Extract*/ false, CostKind);
5913 ScalarCost +=
5914 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5915 }
5916
5917 // Compute the scalarization overhead of needed extractelement
5918 // instructions. For each of the instruction's operands, if the operand can
5919 // be scalarized, add it to the worklist; otherwise, account for the
5920 // overhead.
5921 for (Use &U : I->operands())
5922 if (auto *J = dyn_cast<Instruction>(U.get())) {
5923 assert(VectorType::isValidElementType(J->getType()) &&
5924 "Instruction has non-scalar type");
5925 if (canBeScalarized(J))
5926 Worklist.push_back(J);
5927 else if (needsExtract(J, VF)) {
5928 ScalarCost += TTI.getScalarizationOverhead(
5929 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5930 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5931 /*Extract*/ true, CostKind);
5932 }
5933 }
5934
5935 // Scale the total scalar cost by block probability.
5936 ScalarCost /= getReciprocalPredBlockProb();
5937
5938 // Compute the discount. A non-negative discount means the vector version
5939 // of the instruction costs more, and scalarizing would be beneficial.
5940 Discount += VectorCost - ScalarCost;
5941 ScalarCosts[I] = ScalarCost;
5942 }
5943
5944 return Discount;
5945}
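// Illustrative sketch (standalone, not part of the vectorizer): per
// instruction, the discount above is "vectorized-and-predicated cost" minus
// "scalarized cost scaled by how often the predicated block actually runs".
// The helper below is made up, ignores the insert/extract overhead, and uses
// the same reciprocal-probability convention as above (2 means the block is
// assumed to run half the time).
static int sketchPredInstDiscount(int VectorCost, int ScalarCostPerLane,
                                  int VF, int ReciprocalPredBlockProb = 2) {
  int ScalarCost = (ScalarCostPerLane * VF) / ReciprocalPredBlockProb;
  // A non-negative result means scalarizing the chain is the cheaper option,
  // e.g. VectorCost = 10, ScalarCostPerLane = 3, VF = 4 -> 10 - 6 = 4.
  return VectorCost - ScalarCost;
}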
5946
5951
5952 // For each block.
5953 for (BasicBlock *BB : TheLoop->blocks()) {
5954 VectorizationCostTy BlockCost;
5955
5956 // For each instruction in the old loop.
5957 for (Instruction &I : BB->instructionsWithoutDebug()) {
5958 // Skip ignored values.
5959 if (ValuesToIgnore.count(&I) ||
5960 (VF.isVector() && VecValuesToIgnore.count(&I)))
5961 continue;
5962
5963 VectorizationCostTy C = getInstructionCost(&I, VF);
5964
5965 // Check if we should override the cost.
5966 if (C.first.isValid() &&
5967 ForceTargetInstructionCost.getNumOccurrences() > 0)
5969
5970 // Keep a list of instructions with invalid costs.
5971 if (Invalid && !C.first.isValid())
5972 Invalid->emplace_back(&I, VF);
5973
5974 BlockCost.first += C.first;
5975 BlockCost.second |= C.second;
5976 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5977 << " for VF " << VF << " For instruction: " << I
5978 << '\n');
5979 }
5980
5981 // If we are vectorizing a predicated block, it will have been
5982 // if-converted. This means that the block's instructions (aside from
5983 // stores and instructions that may divide by zero) will now be
5984 // unconditionally executed. For the scalar case, we may not always execute
5985 // the predicated block, if it is an if-else block. Thus, scale the block's
5986 // cost by the probability of executing it. blockNeedsPredication from
5987 // Legal is used so as to not include all blocks in tail folded loops.
5988 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5989 BlockCost.first /= getReciprocalPredBlockProb();
5990
5991 Cost.first += BlockCost.first;
5992 Cost.second |= BlockCost.second;
5993 }
5994
5995 return Cost;
5996}
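// Illustrative sketch (standalone, not part of the vectorizer): the loop cost
// above is the sum of per-block costs, where blocks that stay conditional in
// the scalar plan are divided by the reciprocal block probability.  The
// helper below is made up and uses parallel arrays of per-block cost and a
// predication flag.
static unsigned sketchExpectedLoopCost(const unsigned *BlockCosts,
                                       const bool *BlockIsPredicated,
                                       unsigned NumBlocks, bool ScalarVF,
                                       unsigned ReciprocalPredBlockProb = 2) {
  unsigned Cost = 0;
  for (unsigned I = 0; I < NumBlocks; ++I) {
    unsigned C = BlockCosts[I];
    // Only the scalar plan keeps the block conditional; if-converted vector
    // code executes it unconditionally.
    if (ScalarVF && BlockIsPredicated[I])
      C /= ReciprocalPredBlockProb;
    Cost += C;
  }
  return Cost;
}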
5997
5998/// Gets Address Access SCEV after verifying that the access pattern
5999/// is loop invariant except the induction variable dependence.
6000///
6001/// This SCEV can be sent to the Target in order to estimate the address
6002/// calculation cost.
6004 Value *Ptr,
6007 const Loop *TheLoop) {
6008
6009 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6010 if (!Gep)
6011 return nullptr;
6012
6013 // We are looking for a gep with all loop invariant indices except for one
6014 // which should be an induction variable.
6015 auto SE = PSE.getSE();
6016 unsigned NumOperands = Gep->getNumOperands();
6017 for (unsigned i = 1; i < NumOperands; ++i) {
6018 Value *Opd = Gep->getOperand(i);
6019 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6020 !Legal->isInductionVariable(Opd))
6021 return nullptr;
6022 }
6023
6024 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6025 return PSE.getSCEV(Ptr);
6026}
6027
6029LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6030 ElementCount VF) {
6031 assert(VF.isVector() &&
6032 "Scalarization cost of instruction implies vectorization.");
6033 if (VF.isScalable())
6035
6036 Type *ValTy = getLoadStoreType(I);
6037 auto SE = PSE.getSE();
6038
6039 unsigned AS = getLoadStoreAddressSpace(I);
6041 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6042 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6043 // that it is being called from this specific place.
6044
6045 // Figure out whether the access is strided and get the stride value
6046 // if it's known at compile time.
6047 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6048
6049 // Get the cost of the scalar memory instruction and address computation.
6051 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6052
6053 // Don't pass *I here, since it is scalar but will actually be part of a
6054 // vectorized loop where the user of it is a vectorized instruction.
6056 const Align Alignment = getLoadStoreAlignment(I);
6057 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6058 ValTy->getScalarType(),
6059 Alignment, AS, CostKind);
6060
6061 // Get the overhead of the extractelement and insertelement instructions
6062 // we might create due to scalarization.
6063 Cost += getScalarizationOverhead(I, VF, CostKind);
6064
6065 // If we have a predicated load/store, it will need extra i1 extracts and
6066 // conditional branches, but may not be executed for each vector lane. Scale
6067 // the cost by the probability of executing the predicated block.
6068 if (isPredicatedInst(I)) {
6070
6071 // Add the cost of an i1 extract and a branch
6072 auto *Vec_i1Ty =
6075 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6076 /*Insert=*/false, /*Extract=*/true, CostKind);
6077 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6078
6079 if (useEmulatedMaskMemRefHack(I, VF))
6080 // Artificially set the cost to a value high enough to practically
6081 // disable vectorization of loops with such operations.
6082 Cost = 3000000;
6083 }
6084
6085 return Cost;
6086}
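// Informally, the cost computed above decomposes as (a rough sketch rather
// than an exact formula):
//   VF * (address computation + scalar load/store)
//   + insert/extract overhead for the scalarized lanes
//   + for predicated accesses, the i1 mask extracts and branch, with the
//     total scaled by the probability of the predicated block executing
//     (or replaced by a huge constant when the emulated-mask hack applies).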
6087
6089LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6090 ElementCount VF) {
6091 Type *ValTy = getLoadStoreType(I);
6092 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6094 unsigned AS = getLoadStoreAddressSpace(I);
6095 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6097
6098 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6099 "Stride should be 1 or -1 for consecutive memory access");
6100 const Align Alignment = getLoadStoreAlignment(I);
6102 if (Legal->isMaskRequired(I)) {
6103 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6104 CostKind);
6105 } else {
6106 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6107 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6108 CostKind, OpInfo, I);
6109 }
6110
6111 bool Reverse = ConsecutiveStride < 0;
6112 if (Reverse)
6114 std::nullopt, CostKind, 0);
6115 return Cost;
6116}
6117
6119LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6120 ElementCount VF) {
6121 assert(Legal->isUniformMemOp(*I, VF));
6122
6123 Type *ValTy = getLoadStoreType(I);
6124 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6125 const Align Alignment = getLoadStoreAlignment(I);
6126 unsigned AS = getLoadStoreAddressSpace(I);
6128 if (isa<LoadInst>(I)) {
6129 return TTI.getAddressComputationCost(ValTy) +
6130 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6131 CostKind) +
6133 }
6134 StoreInst *SI = cast<StoreInst>(I);
6135
6136 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6137 return TTI.getAddressComputationCost(ValTy) +
6138 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6139 CostKind) +
6140 (isLoopInvariantStoreValue
6141 ? 0
6142 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6143 CostKind, VF.getKnownMinValue() - 1));
6144}
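// For example (a sketch): a uniform i32 load at VF = 4 is priced as one
// scalar load plus a broadcast to <4 x i32>, while a uniform store whose
// stored value varies per lane additionally pays for extracting the last
// lane (index VF - 1), since only the final value needs to reach memory.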
6145
6147LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6148 ElementCount VF) {
6149 Type *ValTy = getLoadStoreType(I);
6150 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6151 const Align Alignment = getLoadStoreAlignment(I);
6153
6154 return TTI.getAddressComputationCost(VectorTy) +
6156 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6158}
6159
6161LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6162 ElementCount VF) {
6163 Type *ValTy = getLoadStoreType(I);
6164 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6165 unsigned AS = getLoadStoreAddressSpace(I);
6167
6168 auto Group = getInterleavedAccessGroup(I);
6169 assert(Group && "Fail to get an interleaved access group.");
6170
6171 unsigned InterleaveFactor = Group->getFactor();
6172 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6173
6174 // Holds the indices of existing members in the interleaved group.
6176 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6177 if (Group->getMember(IF))
6178 Indices.push_back(IF);
6179
6180 // Calculate the cost of the whole interleaved group.
6181 bool UseMaskForGaps =
6182 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6183 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6185 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6186 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6187
6188 if (Group->isReverse()) {
6189 // TODO: Add support for reversed masked interleaved access.
6191 "Reverse masked interleaved access not supported.");
6192 Cost += Group->getNumMembers() *
6194 std::nullopt, CostKind, 0);
6195 }
6196 return Cost;
6197}
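// Illustrative example (hypothetical loop): for a factor-2 access pattern
// such as reading a[2*i] and a[2*i + 1], the group has two members, so with
// VF = 4 the whole group is priced as one wide access of 8 elements plus the
// shuffles the target needs to (de)interleave it; a reversed group pays one
// extra reverse shuffle per member, as handled above.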
6198
6199std::optional<InstructionCost>
6200LoopVectorizationCostModel::getReductionPatternCost(
6201 Instruction *I, ElementCount VF, Type *Ty,
6203 using namespace llvm::PatternMatch;
6204 // Early exit for no inloop reductions
6205 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6206 return std::nullopt;
6207 auto *VectorTy = cast<VectorType>(Ty);
6208
6209 // We are looking for one of the following patterns, finding the minimal acceptable cost:
6210 // reduce(mul(ext(A), ext(B))) or
6211 // reduce(mul(A, B)) or
6212 // reduce(ext(A)) or
6213 // reduce(A).
6214 // The basic idea is that we walk down the tree to do that, finding the root
6215 // reduction instruction in InLoopReductionImmediateChains. From there we find
6216 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6217 // of the components. If the reduction cost is lower, then we return it for
6218 // the reduction instruction and 0 for the other instructions in the pattern.
6219 // If it is not, we return an invalid cost specifying that the original cost
6220 // method should be used.
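  // For illustration (hypothetical IR, not taken from the source), the
  // reduce.add(mul(ext(A), ext(B))) form corresponds to something like:
  //   %a.ext = sext <16 x i8> %a to <16 x i32>
  //   %b.ext = sext <16 x i8> %b to <16 x i32>
  //   %mul   = mul <16 x i32> %a.ext, %b.ext
  //   %sum   = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
  // On targets with dot-product-like instructions the whole pattern can be
  // cheaper than the sum of its parts, which is what the comparisons against
  // the extended/multiply reduction costs below are checking.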
6221 Instruction *RetI = I;
6222 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6223 if (!RetI->hasOneUser())
6224 return std::nullopt;
6225 RetI = RetI->user_back();
6226 }
6227
6228 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6229 RetI->user_back()->getOpcode() == Instruction::Add) {
6230 RetI = RetI->user_back();
6231 }
6232
6233 // Test if the found instruction is a reduction; if it is not, return
6234 // std::nullopt so that the caller falls back to the original cost modelling.
6235 if (!InLoopReductionImmediateChains.count(RetI))
6236 return std::nullopt;
6237
6238 // Find the reduction this chain is a part of and calculate the basic cost of
6239 // the reduction on its own.
6240 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6241 Instruction *ReductionPhi = LastChain;
6242 while (!isa<PHINode>(ReductionPhi))
6243 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6244
6245 const RecurrenceDescriptor &RdxDesc =
6246 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6247
6249 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6250
6251 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6252 // normal fmul instruction to the cost of the fadd reduction.
6253 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6254 BaseCost +=
6255 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6256
6257 // If we're using ordered reductions then we can just return the base cost
6258 // here, since getArithmeticReductionCost calculates the full ordered
6259 // reduction cost when FP reassociation is not allowed.
6260 if (useOrderedReductions(RdxDesc))
6261 return BaseCost;
6262
6263 // Get the operand that was not the reduction chain and match it to one of the
6264 // patterns, returning the better cost if it is found.
6265 Instruction *RedOp = RetI->getOperand(1) == LastChain
6266 ? dyn_cast<Instruction>(RetI->getOperand(0))
6267 : dyn_cast<Instruction>(RetI->getOperand(1));
6268
6269 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6270
6271 Instruction *Op0, *Op1;
6272 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6273 match(RedOp,
6275 match(Op0, m_ZExtOrSExt(m_Value())) &&
6276 Op0->getOpcode() == Op1->getOpcode() &&
6277 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6279 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6280
6281 // Matched reduce.add(ext(mul(ext(A), ext(B)))
6282 // Note that the extend opcodes need to all match, or if A==B they will have
6283 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6284 // which is equally fine.
6285 bool IsUnsigned = isa<ZExtInst>(Op0);
6286 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6287 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6288
6289 InstructionCost ExtCost =
6290 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6292 InstructionCost MulCost =
6293 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6294 InstructionCost Ext2Cost =
6295 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6297
6299 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6300
6301 if (RedCost.isValid() &&
6302 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6303 return I == RetI ? RedCost : 0;
6304 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6305 !TheLoop->isLoopInvariant(RedOp)) {
6306 // Matched reduce(ext(A))
6307 bool IsUnsigned = isa<ZExtInst>(RedOp);
6308 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6310 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6311 RdxDesc.getFastMathFlags(), CostKind);
6312
6313 InstructionCost ExtCost =
6314 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6316 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6317 return I == RetI ? RedCost : 0;
6318 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6319 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6320 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6321 Op0->getOpcode() == Op1->getOpcode() &&
6323 bool IsUnsigned = isa<ZExtInst>(Op0);
6324 Type *Op0Ty = Op0->getOperand(0)->getType();
6325 Type *Op1Ty = Op1->getOperand(0)->getType();
6326 Type *LargestOpTy =
6327 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6328 : Op0Ty;
6329 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6330
6331 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6332 // different sizes. We take the largest type as the ext to reduce, and add
6333 // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6335 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6338 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6340 InstructionCost MulCost =
6341 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6342
6344 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6345 InstructionCost ExtraExtCost = 0;
6346 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6347 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6348 ExtraExtCost = TTI.getCastInstrCost(
6349 ExtraExtOp->getOpcode(), ExtType,
6350 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6352 }
6353
6354 if (RedCost.isValid() &&
6355 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6356 return I == RetI ? RedCost : 0;
6357 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6358 // Matched reduce.add(mul())
6359 InstructionCost MulCost =
6360 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6361
6363 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6364
6365 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6366 return I == RetI ? RedCost : 0;
6367 }
6368 }
6369
6370 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6371}
6372
6374LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6375 ElementCount VF) {
6376 // Calculate scalar cost only. Vectorization cost should be ready at this
6377 // moment.
6378 if (VF.isScalar()) {
6379 Type *ValTy = getLoadStoreType(I);
6380 const Align Alignment = getLoadStoreAlignment(I);
6381 unsigned AS = getLoadStoreAddressSpace(I);
6382
6383 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6384 return TTI.getAddressComputationCost(ValTy) +
6385 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6386 TTI::TCK_RecipThroughput, OpInfo, I);
6387 }
6388 return getWideningCost(I, VF);
6389}
6390
6392LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6393 ElementCount VF) {
6394 // If we know that this instruction will remain uniform, check the cost of
6395 // the scalar version.
6397 VF = ElementCount::getFixed(1);
6398
6399 if (VF.isVector() && isProfitableToScalarize(I, VF))
6400 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6401
6402 // Forced scalars do not have any scalarization overhead.
6403 auto ForcedScalar = ForcedScalars.find(VF);
6404 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6405 auto InstSet = ForcedScalar->second;
6406 if (InstSet.count(I))
6407 return VectorizationCostTy(
6408 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6409 VF.getKnownMinValue()),
6410 false);
6411 }
6412
6413 Type *VectorTy;
6414 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6415
6416 bool TypeNotScalarized = false;
6417 if (VF.isVector() && VectorTy->isVectorTy()) {
6418 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6419 if (VF.isScalable())
6420 // <vscale x 1 x iN> is assumed to be profitable over iN because
6421 // scalable registers are a distinct register class from scalar ones.
6422 // If we ever find a target which wants to lower scalable vectors
6423 // back to scalars, we'll need to update this code to explicitly
6424 // ask TTI about the register class uses for each part.
6425 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6426 else
6427 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6428 } else
6430 }
6431 return VectorizationCostTy(C, TypeNotScalarized);
6432}
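// For example (a sketch): with a fixed VF of 8 and i64 elements on a target
// with 128-bit vector registers, TTI.getNumberOfParts returns 4 for the
// <8 x i64> type; since 4 < 8 the type counts as "not scalarized": it is
// legalized into vector parts rather than into scalars.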
6433
6434InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6436
6437 // There is no mechanism yet to create a scalable scalarization loop,
6438 // so this is currently Invalid.
6439 if (VF.isScalable())
6441
6442 if (VF.isScalar())
6443 return 0;
6444
6446 Type *RetTy = ToVectorTy(I->getType(), VF);
6447 if (!RetTy->isVoidTy() &&
6448 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6450 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6451 /*Insert*/ true,
6452 /*Extract*/ false, CostKind);
6453
6454 // Some targets keep addresses scalar.
6455 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6456 return Cost;
6457
6458 // Some targets support efficient element stores.
6459 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6460 return Cost;
6461
6462 // Collect operands to consider.
6463 CallInst *CI = dyn_cast<CallInst>(I);
6464 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6465
6466 // Skip operands that do not require extraction/scalarization and do not incur
6467 // any overhead.
6469 for (auto *V : filterExtractingOperands(Ops, VF))
6470 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6472 filterExtractingOperands(Ops, VF), Tys, CostKind);
6473}
6474
6476 if (VF.isScalar())
6477 return;
6478 NumPredStores = 0;
6479 for (BasicBlock *BB : TheLoop->blocks()) {
6480 // For each instruction in the old loop.
6481 for (Instruction &I : *BB) {
6483 if (!Ptr)
6484 continue;
6485
6486 // TODO: We should generate better code and update the cost model for
6487 // predicated uniform stores. Today they are treated as any other
6488 // predicated store (see added test cases in
6489 // invariant-store-vectorization.ll).
6490 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6491 NumPredStores++;
6492
6493 if (Legal->isUniformMemOp(I, VF)) {
6494 auto isLegalToScalarize = [&]() {
6495 if (!VF.isScalable())
6496 // Scalarization of fixed length vectors "just works".
6497 return true;
6498
6499 // We have dedicated lowering for unpredicated uniform loads and
6500 // stores. Note that even with tail folding we know that at least
6501 // one lane is active (i.e. generalized predication is not possible
6502 // here), and the logic below depends on this fact.
6503 if (!foldTailByMasking())
6504 return true;
6505
6506 // For scalable vectors, a uniform memop load is always
6507 // uniform-by-parts and we know how to scalarize that.
6508 if (isa<LoadInst>(I))
6509 return true;
6510
6511 // A uniform store isn't necessarily uniform-by-parts
6512 // and we can't assume scalarization.
6513 auto &SI = cast<StoreInst>(I);
6514 return TheLoop->isLoopInvariant(SI.getValueOperand());
6515 };
6516
6517 const InstructionCost GatherScatterCost =
6519 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6520
6521 // Load: Scalar load + broadcast
6522 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6523 // FIXME: This cost is a significant under-estimate for tail folded
6524 // memory ops.
6525 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6526 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6527
6528 // Choose the better solution for the current VF. Note that Invalid
6529 // costs compare as maximally large. If both are invalid, we get an
6530 // invalid cost, which signals a failure and a vectorization abort.
6531 if (GatherScatterCost < ScalarizationCost)
6532 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6533 else
6534 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6535 continue;
6536 }
6537
6538 // We assume that widening is the best solution when possible.
6539 if (memoryInstructionCanBeWidened(&I, VF)) {
6540 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6541 int ConsecutiveStride = Legal->isConsecutivePtr(
6543 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6544 "Expected consecutive stride.");
6545 InstWidening Decision =
6546 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6547 setWideningDecision(&I, VF, Decision, Cost);
6548 continue;
6549 }
6550
6551 // Choose between Interleaving, Gather/Scatter or Scalarization.
6553 unsigned NumAccesses = 1;
6554 if (isAccessInterleaved(&I)) {
6555 auto Group = getInterleavedAccessGroup(&I);
6556 assert(Group && "Fail to get an interleaved access group.");
6557
6558 // Make one decision for the whole group.
6559 if (getWideningDecision(&I, VF) != CM_Unknown)
6560 continue;
6561
6562 NumAccesses = Group->getNumMembers();
6564 InterleaveCost = getInterleaveGroupCost(&I, VF);
6565 }
6566
6567 InstructionCost GatherScatterCost =
6569 ? getGatherScatterCost(&I, VF) * NumAccesses
6571
6572 InstructionCost ScalarizationCost =
6573 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6574
6575 // Choose the better solution for the current VF,
6576 // write down this decision and use it during vectorization.
6578 InstWidening Decision;
6579 if (InterleaveCost <= GatherScatterCost &&
6580 InterleaveCost < ScalarizationCost) {
6581 Decision = CM_Interleave;
6582 Cost = InterleaveCost;
6583 } else if (GatherScatterCost < ScalarizationCost) {
6584 Decision = CM_GatherScatter;
6585 Cost = GatherScatterCost;
6586 } else {
6587 Decision = CM_Scalarize;
6588 Cost = ScalarizationCost;
6589 }
6590 // If the instruction belongs to an interleave group, the whole group
6591 // receives the same decision. The whole group receives the cost, but
6592 // the cost will actually be assigned to one member instruction.
6593 if (auto Group = getInterleavedAccessGroup(&I))
6594 setWideningDecision(Group, VF, Decision, Cost);
6595 else
6596 setWideningDecision(&I, VF, Decision, Cost);
6597 }
6598 }
6599
6600 // Make sure that any load of an address and any other address computation
6601 // remains scalar unless there is gather/scatter support. This avoids
6602 // inevitable extracts into address registers, and also has the benefit of
6603 // activating LSR more, since that pass can't optimize vectorized
6604 // addresses.
6606 return;
6607
6608 // Start with all scalar pointer uses.
6610 for (BasicBlock *BB : TheLoop->blocks())
6611 for (Instruction &I : *BB) {
6612 Instruction *PtrDef =
6613 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6614 if (PtrDef && TheLoop->contains(PtrDef) &&
6616 AddrDefs.insert(PtrDef);
6617 }
6618
6619 // Add all instructions used to generate the addresses.
6621 append_range(Worklist, AddrDefs);
6622 while (!Worklist.empty()) {
6623 Instruction *I = Worklist.pop_back_val();
6624 for (auto &Op : I->operands())
6625 if (auto *InstOp = dyn_cast<Instruction>(Op))
6626 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6627 AddrDefs.insert(InstOp).second)
6628 Worklist.push_back(InstOp);
6629 }
6630
6631 for (auto *I : AddrDefs) {
6632 if (isa<LoadInst>(I)) {
6633 // Setting the desired widening decision should ideally be handled
6634 // by the cost functions, but since this involves the task of finding out
6635 // if the loaded register is involved in an address computation, it is
6636 // instead changed here when we know this is the case.
6637 InstWidening Decision = getWideningDecision(I, VF);
6638 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6639 // Scalarize a widened load of address.
6641 I, VF, CM_Scalarize,
6642 (VF.getKnownMinValue() *
6643 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6644 else if (auto Group = getInterleavedAccessGroup(I)) {
6645 // Scalarize an interleave group of address loads.
6646 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6647 if (Instruction *Member = Group->getMember(I))
6649 Member, VF, CM_Scalarize,
6650 (VF.getKnownMinValue() *
6651 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6652 }
6653 }
6654 } else
6655 // Make sure I gets scalarized and receives a cost estimate without
6656 // scalarization overhead.
6657 ForcedScalars[VF].insert(I);
6658 }
6659}
6660
6662 assert(!VF.isScalar() &&
6663 "Trying to set a vectorization decision for a scalar VF");
6664
6665 for (BasicBlock *BB : TheLoop->blocks()) {
6666 // For each instruction in the old loop.
6667 for (Instruction &I : *BB) {
6668 CallInst *CI = dyn_cast<CallInst>(&I);
6669
6670 if (!CI)
6671 continue;
6672
6677
6678 Function *ScalarFunc = CI->getCalledFunction();
6679 Type *ScalarRetTy = CI->getType();
6680 SmallVector<Type *, 4> Tys, ScalarTys;
6681 bool MaskRequired = Legal->isMaskRequired(CI);
6682 for (auto &ArgOp : CI->args())
6683 ScalarTys.push_back(ArgOp->getType());
6684
6685 // Compute corresponding vector type for return value and arguments.
6686 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6687 for (Type *ScalarTy : ScalarTys)
6688 Tys.push_back(ToVectorTy(ScalarTy, VF));
6689
6690 // An in-loop reduction using an fmuladd intrinsic is a special case;
6691 // we don't want the normal cost for that intrinsic.
6693 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6696 std::nullopt, *RedCost);
6697 continue;
6698 }
6699
6700 // Estimate cost of scalarized vector call. The source operands are
6701 // assumed to be vectors, so we need to extract individual elements from
6702 // there, execute VF scalar calls, and then gather the result into the
6703 // vector return value.
6704 InstructionCost ScalarCallCost =
6705 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6706
6707 // Compute costs of unpacking argument values for the scalar calls and
6708 // packing the return values to a vector.
6709 InstructionCost ScalarizationCost =
6710 getScalarizationOverhead(CI, VF, CostKind);
6711
6712 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6713
6714 // Find the cost of vectorizing the call, if we can find a suitable
6715 // vector variant of the function.
6716 bool UsesMask = false;
6717 VFInfo FuncInfo;
6718 Function *VecFunc = nullptr;
6719 // Search through any available variants for one we can use at this VF.
6720 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6721 // Must match requested VF.
6722 if (Info.Shape.VF != VF)
6723 continue;
6724
6725 // Must take a mask argument if one is required
6726 if (MaskRequired && !Info.isMasked())
6727 continue;
6728
6729 // Check that all parameter kinds are supported
6730 bool ParamsOk = true;
6731 for (VFParameter Param : Info.Shape.Parameters) {
6732 switch (Param.ParamKind) {
6734 break;
6736 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6737 // Make sure the scalar parameter in the loop is invariant.
6738 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6739 TheLoop))
6740 ParamsOk = false;
6741 break;
6742 }
6744 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6745 // Find the stride for the scalar parameter in this loop and see if
6746 // it matches the stride for the variant.
6747 // TODO: do we need to figure out the cost of an extract to get the
6748 // first lane? Or do we hope that it will be folded away?
6749 ScalarEvolution *SE = PSE.getSE();
6750 const auto *SAR =
6751 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6752
6753 if (!SAR || SAR->getLoop() != TheLoop) {
6754 ParamsOk = false;
6755 break;
6756 }
6757
6758 const SCEVConstant *Step =
6759 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6760
6761 if (!Step ||
6762 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6763 ParamsOk = false;
6764
6765 break;
6766 }
6768 UsesMask = true;
6769 break;
6770 default:
6771 ParamsOk = false;
6772 break;
6773 }
6774 }
6775
6776 if (!ParamsOk)
6777 continue;
6778
6779 // Found a suitable candidate, stop here.
6780 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6781 FuncInfo = Info;
6782 break;
6783 }
6784
6785 // Add in the cost of synthesizing a mask if one wasn't required.
6786 InstructionCost MaskCost = 0;
6787 if (VecFunc && UsesMask && !MaskRequired)
6788 MaskCost = TTI.getShuffleCost(
6791 VecFunc->getFunctionType()->getContext()),
6792 VF));
6793
6794 if (TLI && VecFunc && !CI->isNoBuiltin())
6795 VectorCost =
6796 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6797
6798 // Find the cost of an intrinsic; some targets may have instructions that
6799 // perform the operation without needing an actual call.
6801 if (IID != Intrinsic::not_intrinsic)
6802 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6803
6804 InstructionCost Cost = ScalarCost;
6805 InstWidening Decision = CM_Scalarize;
6806
6807 if (VectorCost <= Cost) {
6808 Cost = VectorCost;
6809 Decision = CM_VectorCall;
6810 }
6811
6812 if (IntrinsicCost <= Cost) {
6813 Cost = IntrinsicCost;
6814 Decision = CM_IntrinsicCall;
6815 }
6816
6817 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6819 }
6820 }
6821}
6822
6824LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6825 Type *&VectorTy) {
6826 Type *RetTy = I->getType();
6828 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6829 auto SE = PSE.getSE();
6831
6832 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6833 ElementCount VF) -> bool {
6834 if (VF.isScalar())
6835 return true;
6836
6837 auto Scalarized = InstsToScalarize.find(VF);
6838 assert(Scalarized != InstsToScalarize.end() &&
6839 "VF not yet analyzed for scalarization profitability");
6840 return !Scalarized->second.count(I) &&
6841 llvm::all_of(I->users(), [&](User *U) {
6842 auto *UI = cast<Instruction>(U);
6843 return !Scalarized->second.count(UI);
6844 });
6845 };
6846 (void) hasSingleCopyAfterVectorization;
6847
6848 if (isScalarAfterVectorization(I, VF)) {
6849 // With the exception of GEPs and PHIs, after scalarization there should
6850 // only be one copy of the instruction generated in the loop. This is
6851 // because the VF is either 1, or any instructions that need scalarizing
6852 // have already been dealt with by the time we get here. As a result,
6853 // it means we don't have to multiply the instruction cost by VF.
6854 assert(I->getOpcode() == Instruction::GetElementPtr ||
6855 I->getOpcode() == Instruction::PHI ||
6856 (I->getOpcode() == Instruction::BitCast &&
6857 I->getType()->isPointerTy()) ||
6858 hasSingleCopyAfterVectorization(I, VF));
6859 VectorTy = RetTy;
6860 } else
6861 VectorTy = ToVectorTy(RetTy, VF);
6862
6863 // TODO: We need to estimate the cost of intrinsic calls.
6864 switch (I->getOpcode()) {
6865 case Instruction::GetElementPtr:
6866 // We mark this instruction as zero-cost because the cost of GEPs in
6867 // vectorized code depends on whether the corresponding memory instruction
6868 // is scalarized or not. Therefore, we handle GEPs with the memory
6869 // instruction cost.
6870 return 0;
6871 case Instruction::Br: {
6872 // In cases of scalarized and predicated instructions, there will be VF
6873 // predicated blocks in the vectorized loop. Each branch around these
6874 // blocks also requires an extract of its vector compare i1 element.
6875 bool ScalarPredicatedBB = false;
6876 BranchInst *BI = cast<BranchInst>(I);
6877 if (VF.isVector() && BI->isConditional() &&
6878 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6879 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6880 ScalarPredicatedBB = true;
6881
6882 if (ScalarPredicatedBB) {
6883 // Not possible to scalarize a scalable vector with predicated instructions.
6884 if (VF.isScalable())
6886 // Return cost for branches around scalarized and predicated blocks.
6887 auto *Vec_i1Ty =
6888 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6889 return (
6891 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6892 /*Insert*/ false, /*Extract*/ true, CostKind) +
6893 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6894 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6895 // The back-edge branch will remain, as will all scalar branches.
6896 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6897 else
6898 // This branch will be eliminated by if-conversion.
6899 return 0;
6900 // Note: We currently assume zero cost for an unconditional branch inside
6901 // a predicated block since it will become a fall-through, although we
6902 // may decide in the future to call TTI for all branches.
6903 }
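  // For example (an informal sketch of the branch case above): with a fixed
  // VF of 4, a branch around a scalarized predicated block is priced as the
  // cost of extracting the four i1 mask elements plus four scalar branches.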
6904 case Instruction::PHI: {
6905 auto *Phi = cast<PHINode>(I);
6906
6907 // First-order recurrences are replaced by vector shuffles inside the loop.
6908 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6910 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6912 cast<VectorType>(VectorTy), Mask, CostKind,
6913 VF.getKnownMinValue() - 1);
6914 }
6915
6916 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6917 // converted into select instructions. We require N - 1 selects per phi
6918 // node, where N is the number of incoming values.
6919 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6920 return (Phi->getNumIncomingValues() - 1) *
6922 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6923 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6925
6926 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6927 }
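  // For example (a sketch of the phi case above): a phi in a non-header block
  // with three incoming values is priced as two vector selects, since N
  // incoming values require N - 1 selects after if-conversion.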
6928 case Instruction::UDiv:
6929 case Instruction::SDiv:
6930 case Instruction::URem:
6931 case Instruction::SRem:
6932 if (VF.isVector() && isPredicatedInst(I)) {
6933 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6934 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6935 ScalarCost : SafeDivisorCost;
6936 }
6937 // We've proven all lanes safe to speculate, fall through.
6938 [[fallthrough]];
6939 case Instruction::Add:
6940 case Instruction::FAdd:
6941 case Instruction::Sub:
6942 case Instruction::FSub:
6943 case Instruction::Mul:
6944 case Instruction::FMul:
6945 case Instruction::FDiv:
6946 case Instruction::FRem:
6947 case Instruction::Shl:
6948 case Instruction::LShr:
6949 case Instruction::AShr:
6950 case Instruction::And:
6951 case Instruction::Or:
6952 case Instruction::Xor: {
6953 // If we're speculating on the stride being 1, the multiplication may
6954 // fold away. We can generalize this for all operations using the notion
6955 // of neutral elements. (TODO)
6956 if (I->getOpcode() == Instruction::Mul &&
6957 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6958 PSE.getSCEV(I->getOperand(1))->isOne()))
6959 return 0;
6960
6961 // Detect reduction patterns
6962 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6963 return *RedCost;
6964
6965 // Certain instructions can be cheaper to vectorize if they have a constant
6966 // second vector operand. One example of this is shifts on x86.
6967 Value *Op2 = I->getOperand(1);
6968 auto Op2Info = TTI.getOperandInfo(Op2);
6969 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6970 Legal->isInvariant(Op2))
6972
6973 SmallVector<const Value *, 4> Operands(I->operand_values());
6975 I->getOpcode(), VectorTy, CostKind,
6976 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6977 Op2Info, Operands, I, TLI);
6978 }
6979 case Instruction::FNeg: {
6981 I->getOpcode(), VectorTy, CostKind,
6982 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6983 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6984 I->getOperand(0), I);
6985 }
6986 case Instruction::Select: {
6987 SelectInst *SI = cast<SelectInst>(I);
6988 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6989 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6990
6991 const Value *Op0, *Op1;
6992 using namespace llvm::PatternMatch;
6993 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6994 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6995 // select x, y, false --> x & y
6996 // select x, true, y --> x | y
6997 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6998 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6999 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7000 Op1->getType()->getScalarSizeInBits() == 1);
7001
7004 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7005 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7006 }
7007
7008 Type *CondTy = SI->getCondition()->getType();
7009 if (!ScalarCond)
7010 CondTy = VectorType::get(CondTy, VF);
7011
7013 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7014 Pred = Cmp->getPredicate();
7015 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7016 CostKind, I);
7017 }
7018 case Instruction::ICmp:
7019 case Instruction::FCmp: {
7020 Type *ValTy = I->getOperand(0)->getType();
7021 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7022 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7023 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7024 VectorTy = ToVectorTy(ValTy, VF);
7025 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7026 cast<CmpInst>(I)->getPredicate(), CostKind,
7027 I);
7028 }
7029 case Instruction::Store:
7030 case Instruction::Load: {
7031 ElementCount Width = VF;
7032 if (Width.isVector()) {
7033 InstWidening Decision = getWideningDecision(I, Width);
7034 assert(Decision != CM_Unknown &&
7035 "CM decision should be taken at this point");
7038 if (Decision == CM_Scalarize)
7039 Width = ElementCount::getFixed(1);
7040 }
7041 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7042 return getMemoryInstructionCost(I, VF);
7043 }
7044 case Instruction::BitCast:
7045 if (I->getType()->isPointerTy())
7046 return 0;
7047 [[fallthrough]];
7048 case Instruction::ZExt:
7049 case Instruction::SExt:
7050 case Instruction::FPToUI:
7051 case Instruction::FPToSI:
7052 case Instruction::FPExt:
7053 case Instruction::PtrToInt:
7054 case Instruction::IntToPtr:
7055 case Instruction::SIToFP:
7056 case Instruction::UIToFP:
7057 case Instruction::Trunc:
7058 case Instruction::FPTrunc: {
7059 // Computes the CastContextHint from a Load/Store instruction.
7060 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7061 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7062 "Expected a load or a store!");
7063
7064 if (VF.isScalar() || !TheLoop->contains(I))
7066
7067 switch (getWideningDecision(I, VF)) {
7079 llvm_unreachable("Instr did not go through cost modelling?");
7082 llvm_unreachable_internal("Instr has invalid widening decision");
7083 }
7084
7085 llvm_unreachable("Unhandled case!");
7086 };
7087
7088 unsigned Opcode = I->getOpcode();
7090 // For Trunc, the context is the only user, which must be a StoreInst.
7091 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7092 if (I->hasOneUse())
7093 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7094 CCH = ComputeCCH(Store);
7095 }
7096 // For Z/Sext, the context is the operand, which must be a LoadInst.
7097 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7098 Opcode == Instruction::FPExt) {
7099 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7100 CCH = ComputeCCH(Load);
7101 }
7102
7103 // We optimize the truncation of induction variables having constant
7104 // integer steps. The cost of these truncations is the same as the scalar
7105 // operation.
7106 if (isOptimizableIVTruncate(I, VF)) {
7107 auto *Trunc = cast<TruncInst>(I);
7108 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7109 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7110 }
7111
7112 // Detect reduction patterns
7113 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7114 return *RedCost;
7115
7116 Type *SrcScalarTy = I->getOperand(0)->getType();
7117 Type *SrcVecTy =
7118 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7120 // This cast is going to be shrunk. This may remove the cast or it might
7121 // turn it into a slightly different cast. For example, if MinBW == 16,
7122 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7123 //
7124 // Calculate the modified src and dest types.
7125 Type *MinVecTy = VectorTy;
7126 if (Opcode == Instruction::Trunc) {
7127 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7128 VectorTy =
7129 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7130 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7131 // Leave SrcVecTy unchanged - we only shrink the destination element
7132 // type.
7133 VectorTy =
7134 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7135 }
7136 }
7137
7138 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7139 }
7140 case Instruction::Call:
7141 return getVectorCallCost(cast<CallInst>(I), VF);
7142 case Instruction::ExtractValue:
7144 case Instruction::Alloca:
7145 // We cannot easily widen alloca to a scalable alloca, as
7146 // the result would need to be a vector of pointers.
7147 if (VF.isScalable())
7149 [[fallthrough]];
7150 default:
7151 // This opcode is unknown. Assume that it is the same as 'mul'.
7152 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7153 } // end of switch.
7154}
7155
7157 // Ignore ephemeral values.
7159
7160 // Find all stores to invariant variables. Since they are going to sink
7161 // outside the loop, we do not need to calculate the cost for them.
7162 for (BasicBlock *BB : TheLoop->blocks())
7163 for (Instruction &I : *BB) {
7164 StoreInst *SI;
7165 if ((SI = dyn_cast<StoreInst>(&I)) &&
7166 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7167 ValuesToIgnore.insert(&I);
7168 }
7169
7170 // Ignore type-promoting instructions we identified during reduction
7171 // detection.
7172 for (const auto &Reduction : Legal->getReductionVars()) {
7173 const RecurrenceDescriptor &RedDes = Reduction.second;
7174 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7175 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7176 }
7177 // Ignore type-casting instructions we identified during induction
7178 // detection.
7179 for (const auto &Induction : Legal->getInductionVars()) {
7180 const InductionDescriptor &IndDes = Induction.second;
7181 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7182 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7183 }
7184}
7185
7187 for (const auto &Reduction : Legal->getReductionVars()) {
7188 PHINode *Phi = Reduction.first;
7189 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7190
7191 // We don't collect reductions that are type promoted (yet).
7192 if (RdxDesc.getRecurrenceType() != Phi->getType())
7193 continue;
7194
7195 // If the target would prefer this reduction to happen "in-loop", then we
7196 // want to record it as such.
7197 unsigned Opcode = RdxDesc.getOpcode();
7198 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7199 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7201 continue;
7202
7203 // Check that we can correctly put the reductions into the loop, by
7204 // finding the chain of operations that leads from the phi to the loop
7205 // exit value.
7206 SmallVector<Instruction *, 4> ReductionOperations =
7207 RdxDesc.getReductionOpChain(Phi, TheLoop);
7208 bool InLoop = !ReductionOperations.empty();
7209
7210 if (InLoop) {
7211 InLoopReductions.insert(Phi);
7212 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7213 Instruction *LastChain = Phi;
7214 for (auto *I : ReductionOperations) {
7215 InLoopReductionImmediateChains[I] = LastChain;
7216 LastChain = I;
7217 }
7218 }
7219 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7220 << " reduction for phi: " << *Phi << "\n");
7221 }
7222}
7223
7225 DebugLoc DL, const Twine &Name) {
7227 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7228 return tryInsertInstruction(
7229 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7230}
7231
7232// This function will select a scalable VF if the target supports scalable
7233// vectors and a fixed one otherwise.
7234// TODO: we could return a pair of values that specify the max VF and
7235// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7236// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7237// doesn't have a cost model that can choose which plan to execute if
7238// more than one is generated.
7241 unsigned WidestType;
7242 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7243
7248
7250 unsigned N = RegSize.getKnownMinValue() / WidestType;
7251 return ElementCount::get(N, RegSize.isScalable());
7252}
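// For example (a sketch): with 128-bit vector registers and a widest loop
// type of 32 bits, this returns an element count of 128 / 32 = 4, marked
// scalable when the register size itself is scalable.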
7253
7256 ElementCount VF = UserVF;
7257 // Outer loop handling: outer loops may require CFG and instruction level
7258 // transformations before even evaluating whether vectorization is profitable.
7259 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7260 // the vectorization pipeline.
7261 if (!OrigLoop->isInnermost()) {
7262 // If the user doesn't provide a vectorization factor, determine a
7263 // reasonable one.
7264 if (UserVF.isZero()) {
7265 VF = determineVPlanVF(TTI, CM);
7266 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7267
7268 // Make sure we have a VF > 1 for stress testing.
7269 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7270 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7271 << "overriding computed VF.\n");
7272 VF = ElementCount::getFixed(4);
7273 }
7274 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7276 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7277 << "not supported by the target.\n");
7279 "Scalable vectorization requested but not supported by the target",
7280 "the scalable user-specified vectorization width for outer-loop "
7281 "vectorization cannot be used because the target does not support "
7282 "scalable vectors.",
7283 "ScalableVFUnfeasible", ORE, OrigLoop);
7285 }
7286 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7288 "VF needs to be a power of two");
7289 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7290 << "VF " << VF << " to build VPlans.\n");
7291 buildVPlans(VF, VF);
7292
7293 // For VPlan build stress testing, we bail out after VPlan construction.
7296
7297 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7298 }
7299
7300 LLVM_DEBUG(
7301 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7302 "VPlan-native path.\n");
7304}
7305
7306std::optional<VectorizationFactor>
7308 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7311
7312 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7313 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7314 return std::nullopt;
7315
7316 // Invalidate interleave groups if all blocks of the loop will be predicated.
7317 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7319 LLVM_DEBUG(
7320 dbgs()
7321 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7322 "which requires masked-interleaved support.\n");
7324 // Invalidating interleave groups also requires invalidating all decisions
7325 // based on them, which includes widening decisions and uniform and scalar
7326 // values.
7328 }
7329
7330 ElementCount MaxUserVF =
7331 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7332 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7333 if (!UserVF.isZero() && UserVFIsLegal) {
7335 "VF needs to be a power of two");
7336 // Collect the instructions (and their associated costs) that will be more
7337 // profitable to scalarize.
7339 if (CM.selectUserVectorizationFactor(UserVF)) {
7340 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7341 buildVPlansWithVPRecipes(UserVF, UserVF);
7342 if (!hasPlanWithVF(UserVF)) {
7343 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7344 << ".\n");
7345 return std::nullopt;
7346 }
7347
7349 return {{UserVF, 0, 0}};
7350 } else
7351 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7352 "InvalidCost", ORE, OrigLoop);
7353 }
7354
7355 // Populate the set of Vectorization Factor Candidates.
7356 ElementCountSet VFCandidates;
7357 for (auto VF = ElementCount::getFixed(1);
7358 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7359 VFCandidates.insert(VF);
7360 for (auto VF = ElementCount::getScalable(1);
7361 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7362 VFCandidates.insert(VF);
7363
7365 for (const auto &VF : VFCandidates) {
7366 // Collect Uniform and Scalar instructions after vectorization with VF.
7368
7369 // Collect the instructions (and their associated costs) that will be more
7370 // profitable to scalarize.
7371 if (VF.isVector())
7373 }
7374
7375 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7376 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7377
7379 if (!MaxFactors.hasVector())
7381
7382 // Select the optimal vectorization factor.
7383 VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7384 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7385 if (!hasPlanWithVF(VF.Width)) {
7386 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7387 << ".\n");
7388 return std::nullopt;
7389 }
7390 return VF;
7391}
7392
7394 assert(count_if(VPlans,
7395 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7396 1 &&
7397 "Best VF has not a single VPlan.");
7398
7399 for (const VPlanPtr &Plan : VPlans) {
7400 if (Plan->hasVF(VF))
7401 return *Plan.get();
7402 }
7403 llvm_unreachable("No plan found!");
7404}
7405
7408 // Reserve first location for self reference to the LoopID metadata node.
7409 MDs.push_back(nullptr);
7410 bool IsUnrollMetadata = false;
7411 MDNode *LoopID = L->getLoopID();
7412 if (LoopID) {
7413 // First find existing loop unrolling disable metadata.
7414 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7415 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7416 if (MD) {
7417 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7418 IsUnrollMetadata =
7419 S && S->getString().starts_with("llvm.loop.unroll.disable");
7420 }
7421 MDs.push_back(LoopID->getOperand(i));
7422 }
7423 }
7424
7425 if (!IsUnrollMetadata) {
7426 // Add runtime unroll disable metadata.
7427 LLVMContext &Context = L->getHeader()->getContext();
7428 SmallVector<Metadata *, 1> DisableOperands;
7429 DisableOperands.push_back(
7430 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7431 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7432 MDs.push_back(DisableNode);
7433 MDNode *NewLoopID = MDNode::get(Context, MDs);
7434 // Set operand 0 to refer to the loop id itself.
7435 NewLoopID->replaceOperandWith(0, NewLoopID);
7436 L->setLoopID(NewLoopID);
7437 }
7438}
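// For illustration (hypothetical metadata, shown only as an example), a loop
// without existing unroll metadata ends up with a loop ID along the lines of:
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// with operand 0 of !0 pointing back at !0 itself, as set up above.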
7439
7440// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7441// create a merge phi node for it and add it to \p ReductionResumeValues.
7443 VPInstruction *RedResult,
7445 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7446 if (!RedResult ||
7448 return;
7449
7450 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7451 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7452
7453 Value *FinalValue =
7454 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7455 auto *ResumePhi =
7456 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7457
7458 // TODO: bc.merge.rdx should not be created here, instead it should be
7459 // modeled in VPlan.
7460 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7461 // Create a phi node that merges control-flow from the backedge-taken check
7462 // block and the middle block.
7463 auto *BCBlockPhi =
7464 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7465 LoopScalarPreHeader->getTerminator()->getIterator());
7466
7467 // If we are fixing reductions in the epilogue loop then we should already
7468 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7469 // we carry over the incoming values correctly.
7470 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7471 if (Incoming == LoopMiddleBlock)
7472 BCBlockPhi->addIncoming(FinalValue, Incoming);
7473 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7474 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7475 Incoming);
7476 else
7477 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7478 }
7479
7480 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7481 // TODO: This fixup should instead be modeled in VPlan.
7482 // Fix the scalar loop reduction variable with the incoming reduction sum
7483 // from the vector body and from the backedge value.
7484 int IncomingEdgeBlockIdx =
7485 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7486 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7487 // Pick the other block.
7488 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7489 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7490 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7491 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7492
7493 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7494}
7495
7496std::pair<DenseMap<const SCEV *, Value *>,
7499 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7500 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7501 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7502 assert(BestVPlan.hasVF(BestVF) &&
7503 "Trying to execute plan with unsupported VF");
7504 assert(BestVPlan.hasUF(BestUF) &&
7505 "Trying to execute plan with unsupported UF");
7506 assert(
7507 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7508 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7509
7510 if (!IsEpilogueVectorization)
7511 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7512
7513 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7514 << ", UF=" << BestUF << '\n');
7515 BestVPlan.setName("Final VPlan");
7516 LLVM_DEBUG(BestVPlan.dump());
7517
7518 // Perform the actual loop transformation.
7519 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7520 OrigLoop->getHeader()->getContext());
7521
7522 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7523 // before making any changes to the CFG.
7524 if (!BestVPlan.getPreheader()->empty()) {
7525 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7527 BestVPlan.getPreheader()->execute(&State);
7528 }
7529 if (!ILV.getTripCount())
7530 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7531 else
7532 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7533 "count during epilogue vectorization");
7534
7535 // 1. Set up the skeleton for vectorization, including vector pre-header and
7536 // middle block. The vector loop is created during VPlan execution.
7537 Value *CanonicalIVStartValue;
7538 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7539 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7540 : State.ExpandedSCEVs);
7541
7542 // Only use noalias metadata when using memory checks guaranteeing no overlap
7543 // across all iterations.
7544 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7545 std::unique_ptr<LoopVersioning> LVer = nullptr;
7546 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7548
7549 // We currently don't use LoopVersioning for the actual loop cloning but we
7550 // still use it to add the noalias metadata.
7551 // TODO: Find a better way to re-use LoopVersioning functionality to add
7552 // metadata.
7553 LVer = std::make_unique<LoopVersioning>(
7554 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7555 PSE.getSE());
7556 State.LVer = &*LVer;
7558 }
7559
7561
7562 //===------------------------------------------------===//
7563 //
7564 // Notice: any optimization or new instruction that goes
7565 // into the code below should also be implemented in
7566 // the cost-model.
7567 //
7568 //===------------------------------------------------===//
7569
7570 // 2. Copy and widen instructions from the old loop into the new loop.
7571 BestVPlan.prepareToExecute(ILV.getTripCount(),
7572 ILV.getOrCreateVectorTripCount(nullptr),
7573 CanonicalIVStartValue, State);
7574
7575 BestVPlan.execute(&State);
7576
7577 // 2.5 Collect reduction resume values.
7579 auto *ExitVPBB =
7580 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7581 for (VPRecipeBase &R : *ExitVPBB) {
7582 createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7583 ReductionResumeValues, State, OrigLoop,
7584 State.CFG.VPBB2IRBB[ExitVPBB]);
7585 }
7586
7587 // 2.6. Maintain Loop Hints
7588 // Keep all loop hints from the original loop on the vector loop (we'll
7589 // replace the vectorizer-specific hints below).
7590 MDNode *OrigLoopID = OrigLoop->getLoopID();
7591
7592 std::optional<MDNode *> VectorizedLoopID =
7595
7596 VPBasicBlock *HeaderVPBB =
7598 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7599 if (VectorizedLoopID)
7600 L->setLoopID(*VectorizedLoopID);
7601 else {
7602 // Keep all loop hints from the original loop on the vector loop (we'll
7603 // replace the vectorizer-specific hints below).
7604 if (MDNode *LID = OrigLoop->getLoopID())
7605 L->setLoopID(LID);
7606
7607 LoopVectorizeHints Hints(L, true, *ORE);
7608 Hints.setAlreadyVectorized();
7609 }
7611 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7612 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7614
7615 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7616 // predication, updating analyses.
7617 ILV.fixVectorizedLoop(State, BestVPlan);
7618
7620
7621 return {State.ExpandedSCEVs, ReductionResumeValues};
7622}
7623
7624#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7626 for (const auto &Plan : VPlans)
7628 Plan->printDOT(O);
7629 else
7630 Plan->print(O);
7631}
7632#endif
7633
7634//===--------------------------------------------------------------------===//
7635// EpilogueVectorizerMainLoop
7636//===--------------------------------------------------------------------===//
7637
7638/// This function is partially responsible for generating the control flow
7639/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7640std::pair<BasicBlock *, Value *>
7642 const SCEV2ValueTy &ExpandedSCEVs) {
7644
7645 // Generate the code to check the minimum iteration count of the vector
7646 // epilogue (see below).
7650
7651 // Generate the code to check any assumptions that we've made for SCEV
7652 // expressions.
7654
7655 // Generate the code that checks at runtime if arrays overlap. We put the
7656 // checks into a separate block to make the more common case of few elements
7657 // faster.
7659
7660 // Generate the iteration count check for the main loop, *after* the check
7661 // for the epilogue loop, so that the path-length is shorter for the case
7662 // that goes directly through the vector epilogue. The longer-path length for
7663 // the main loop is compensated for by the gain from vectorizing the larger
7664 // trip count. Note: the branch will get updated later on when we vectorize
7665 // the epilogue.
7668
7669 // Generate the induction variable.
7671
7672 // Skip creating induction resume values here because they will be created in
7673 // the second pass for the scalar loop. The induction resume values for the
7674 // inductions in the epilogue loop are created before executing the plan for
7675 // the epilogue loop.
7676
7677 return {completeLoopSkeleton(), nullptr};
7678}
7679
7681 LLVM_DEBUG({
7682 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7683 << "Main Loop VF:" << EPI.MainLoopVF
7684 << ", Main Loop UF:" << EPI.MainLoopUF
7685 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7686 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7687 });
7688}
7689
7692 dbgs() << "intermediate fn:\n"
7693 << *OrigLoop->getHeader()->getParent() << "\n";
7694 });
7695}
7696
7697BasicBlock *
7699 bool ForEpilogue) {
7700 assert(Bypass && "Expected valid bypass basic block.");
7701 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7702 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7703 Value *Count = getTripCount();
7704 // Reuse existing vector loop preheader for TC checks.
7705 // Note that new preheader block is generated for vector loop.
7706 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7707 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7708
7709 // Generate code to check if the loop's trip count is less than VF * UF of the
7710 // main vector loop.
7711 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7712 : VF.isVector())
7715
7716 Value *CheckMinIters = Builder.CreateICmp(
7717 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7718 "min.iters.check");
7719
7720 if (!ForEpilogue)
7721 TCCheckBlock->setName("vector.main.loop.iter.check");
7722
7723 // Create new preheader for vector loop.
7724 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7725 DT, LI, nullptr, "vector.ph");
7726
7727 if (ForEpilogue) {
7728 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7729 DT->getNode(Bypass)->getIDom()) &&
7730 "TC check is expected to dominate Bypass");
7731
7732 // Update dominator for Bypass & LoopExit.
7733 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7734 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7735 // For loops with multiple exits, there's no edge from the middle block
7736 // to exit blocks (as the epilogue must run) and thus no need to update
7737 // the immediate dominator of the exit blocks.
7739
7740 LoopBypassBlocks.push_back(TCCheckBlock);
7741
7742 // Save the trip count so we don't have to regenerate it in the
7743 // vec.epilog.iter.check. This is safe to do because the trip count
7744 // generated here dominates the vector epilog iter check.
7745 EPI.TripCount = Count;
7746 }
7747
7748 BranchInst &BI =
7749 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7752 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7753
7754 return TCCheckBlock;
7755}
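// Illustrative sketch (not part of the upstream source): a scalar model of the
// minimum-iteration-count guard emitted above, assuming a fixed (non-scalable)
// VF so the step is simply VF * UF. If the trip count cannot cover one wide
// iteration (or must leave at least one iteration for a required scalar
// epilogue), control bypasses the vector loop. Names below are hypothetical
// and exist only for this example; uint64_t assumes <cstdint> is available.
static inline bool bypassVectorLoop(uint64_t TripCount, unsigned VF,
                                    unsigned UF, bool RequiresScalarEpilogue) {
  uint64_t Step = static_cast<uint64_t>(VF) * UF;
  // Mirrors the predicate chosen above: with a required scalar epilogue the
  // comparison must keep at least one iteration for the scalar remainder loop.
  return RequiresScalarEpilogue ? TripCount <= Step : TripCount < Step;
}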
7756
7757//===--------------------------------------------------------------------===//
7758// EpilogueVectorizerEpilogueLoop
7759//===--------------------------------------------------------------------===//
7760
7761/// This function is partially responsible for generating the control flow
7762/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7763std::pair<BasicBlock *, Value *>
7765 const SCEV2ValueTy &ExpandedSCEVs) {
7766 createVectorLoopSkeleton("vec.epilog.");
7767
7768 // Now, compare the remaining count and if there aren't enough iterations to
7769 // execute the vectorized epilogue skip to the scalar part.
7770 LoopVectorPreHeader->setName("vec.epilog.ph");
7771 BasicBlock *VecEpilogueIterationCountCheck =
7773 nullptr, "vec.epilog.iter.check", true);
7775 VecEpilogueIterationCountCheck);
7776
7777 // Adjust the control flow taking the state info from the main loop
7778 // vectorization into account.
7780 "expected this to be saved from the previous pass.");
7782 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7783
7786
7788 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7789
7790 if (EPI.SCEVSafetyCheck)
7792 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7793 if (EPI.MemSafetyCheck)
7795 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7796
7798 VecEpilogueIterationCountCheck,
7799 VecEpilogueIterationCountCheck->getSinglePredecessor());
7800
7803 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7804 // If there is an epilogue which must run, there's no edge from the
7805 // middle block to exit blocks and thus no need to update the immediate
7806 // dominator of the exit blocks.
7809
7810 // Keep track of bypass blocks, as they feed start values to the induction and
7811 // reduction phis in the scalar loop preheader.
7812 if (EPI.SCEVSafetyCheck)
7814 if (EPI.MemSafetyCheck)
7817
7818 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7819 // reductions which merge control-flow from the latch block and the middle
7820 // block. Update the incoming values here and move the Phi into the preheader.
7821 SmallVector<PHINode *, 4> PhisInBlock;
7822 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7823 PhisInBlock.push_back(&Phi);
7824
7825 for (PHINode *Phi : PhisInBlock) {
7826 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7827 Phi->replaceIncomingBlockWith(
7828 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7829 VecEpilogueIterationCountCheck);
7830
7831 // If the phi doesn't have an incoming value from the
7832 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7833 // value and also those from other check blocks. This is needed for
7834 // reduction phis only.
7835 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7836 return EPI.EpilogueIterationCountCheck == IncB;
7837 }))
7838 continue;
7839 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7840 if (EPI.SCEVSafetyCheck)
7841 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7842 if (EPI.MemSafetyCheck)
7843 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7844 }
7845
7846 // Generate a resume induction for the vector epilogue and put it in the
7847 // vector epilogue preheader
7848 Type *IdxTy = Legal->getWidestInductionType();
7849 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7851 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7852 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7854
7855 // Generate induction resume values. These variables save the new starting
7856 // indexes for the scalar loop. They are used to test if there are any tail
7857 // iterations left once the vector loop has completed.
7858 // Note that when the vectorized epilogue is skipped due to iteration count
7859 // check, then the resume value for the induction variable comes from
7860 // the trip count of the main vector loop, hence passing the AdditionalBypass
7861 // argument.
7862 createInductionResumeValues(ExpandedSCEVs,
7863 {VecEpilogueIterationCountCheck,
7864 EPI.VectorTripCount} /* AdditionalBypass */);
7865
7866 return {completeLoopSkeleton(), EPResumeVal};
7867}
7868
7869BasicBlock *
7871 BasicBlock *Bypass, BasicBlock *Insert) {
7872
7874 "Expected trip count to have been safed in the first pass.");
7875 assert(
7876 (!isa<Instruction>(EPI.TripCount) ||
7877 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7878 "saved trip count does not dominate insertion point.");
7879 Value *TC = EPI.TripCount;
7880 IRBuilder<> Builder(Insert->getTerminator());
7881 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7882
7883 // Generate code to check if the loop's trip count is less than VF * UF of the
7884 // vector epilogue loop.
7885 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7888
7889 Value *CheckMinIters =
7890 Builder.CreateICmp(P, Count,
7893 "min.epilog.iters.check");
7894
7895 BranchInst &BI =
7896 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7898 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7899 unsigned EpilogueLoopStep =
7901 // We assume the remaining `Count` is equally distributed in
7902 // [0, MainLoopStep), so the probability for
7903 // `Count < EpilogueLoopStep` should be
7904 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
7905 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7906 const uint32_t Weights[] = {EstimatedSkipCount,
7907 MainLoopStep - EstimatedSkipCount};
7908 setBranchWeights(BI, Weights);
7909 }
7910 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7911
7912 LoopBypassBlocks.push_back(Insert);
7913 return Insert;
7914}
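// Illustrative sketch (not part of the upstream source): the branch-weight
// estimate used above, written out as a tiny standalone helper. It models the
// remaining trip count as uniformly distributed in [0, MainLoopStep), so the
// chance of skipping the vector epilogue (Count < EpilogueLoopStep) is
// min(MainLoopStep, EpilogueLoopStep) / MainLoopStep. The struct and function
// names here are hypothetical and exist only for this example.
struct EpilogueSkipWeights {
  unsigned Skip;  // weight of the edge that bypasses the vector epilogue
  unsigned Taken; // weight of the edge into the vector epilogue
};
static inline EpilogueSkipWeights
estimateEpilogueSkipWeights(unsigned MainLoopStep, unsigned EpilogueLoopStep) {
  // E.g. MainLoopStep = 8 (VF=4, UF=2) and EpilogueLoopStep = 4 gives
  // weights {4, 4}: roughly half of the possible remainders skip the epilogue.
  unsigned EstimatedSkipCount =
      MainLoopStep < EpilogueLoopStep ? MainLoopStep : EpilogueLoopStep;
  return {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount};
}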
7915
7917 LLVM_DEBUG({
7918 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7919 << "Epilogue Loop VF:" << EPI.EpilogueVF
7920 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7921 });
7922}
7923
7926 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7927 });
7928}
7929
7931 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7932 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7933 bool PredicateAtRangeStart = Predicate(Range.Start);
7934
7935 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7936 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7937 Range.End = TmpVF;
7938 break;
7939 }
7940
7941 return PredicateAtRangeStart;
7942}
7943
7944/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7945/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7946/// of VF's starting at a given VF and extending it as much as possible. Each
7947/// vectorization decision can potentially shorten this sub-range during
7948/// buildVPlan().
7950 ElementCount MaxVF) {
7951 auto MaxVFTimes2 = MaxVF * 2;
7952 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7953 VFRange SubRange = {VF, MaxVFTimes2};
7954 VPlans.push_back(buildVPlan(SubRange));
7955 VF = SubRange.End;
7956 }
7957}
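// Illustrative sketch (not part of the upstream source): how a VF range
// {MinVF, 2*MinVF, ..., MaxVF} gets partitioned into sub-ranges over which all
// queried decisions agree. The toy helper below mirrors the logic of
// getDecisionAndClampRange: evaluate the predicate at the start of the range
// and shrink the (exclusive) end to the first VF where the answer flips.
// Plain unsigned values stand in for ElementCount; all names are hypothetical.
template <typename PredT>
static inline unsigned clampRangeEnd(unsigned Start, unsigned End, PredT Pred) {
  bool AtStart = Pred(Start);
  for (unsigned VF = Start * 2; VF < End; VF *= 2)
    if (Pred(VF) != AtStart)
      return VF; // new exclusive end of the sub-range
  return End;
}
// Usage: with MinVF=2, MaxVF=16 and a predicate that flips at VF=8, the outer
// loop in buildVPlans would create sub-ranges [2,8) and [8,32), i.e. one VPlan
// covering VFs {2,4} and another covering {8,16}.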
7958
7959iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7961 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7962 if (auto *I = dyn_cast<Instruction>(Op)) {
7963 if (auto *R = Ingredient2Recipe.lookup(I))
7964 return R->getVPSingleValue();
7965 }
7966 return Plan.getOrAddLiveIn(Op);
7967 };
7968 return map_range(Operands, Fn);
7969}
7970
7972 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7973
7974 // Look for cached value.
7975 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7976 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7977 if (ECEntryIt != EdgeMaskCache.end())
7978 return ECEntryIt->second;
7979
7980 VPValue *SrcMask = getBlockInMask(Src);
7981
7982 // The terminator has to be a branch inst!
7983 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7984 assert(BI && "Unexpected terminator found");
7985
7986 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7987 return EdgeMaskCache[Edge] = SrcMask;
7988
7989 // If source is an exiting block, we know the exit edge is dynamically dead
7990 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7991 // adding uses of an otherwise potentially dead instruction.
7992 if (OrigLoop->isLoopExiting(Src))
7993 return EdgeMaskCache[Edge] = SrcMask;
7994
7995 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7996 assert(EdgeMask && "No Edge Mask found for condition");
7997
7998 if (BI->getSuccessor(0) != Dst)
7999 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8000
8001 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8002 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8003 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8004 // The select version does not introduce new UB if SrcMask is false and
8005 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8006 VPValue *False = Plan.getOrAddLiveIn(
8008 EdgeMask =
8009 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8010 }
8011
8012 return EdgeMaskCache[Edge] = EdgeMask;
8013}
8014
8016 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8017
8018 // Look for cached value.
8019 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8020 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8021 assert(ECEntryIt != EdgeMaskCache.end() &&
8022 "looking up mask for edge which has not been created");
8023 return ECEntryIt->second;
8024}
8025
8027 BasicBlock *Header = OrigLoop->getHeader();
8028
8029 // When not folding the tail, use nullptr to model all-true mask.
8030 if (!CM.foldTailByMasking()) {
8031 BlockMaskCache[Header] = nullptr;
8032 return;
8033 }
8034
8035 // Introduce the early-exit compare IV <= BTC to form header block mask.
8036 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8037 // constructing the desired canonical IV in the header block as its first
8038 // non-phi instructions.
8039
8040 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8041 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8042 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8043 HeaderVPBB->insert(IV, NewInsertionPoint);
8044
8045 VPBuilder::InsertPointGuard Guard(Builder);
8046 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8047 VPValue *BlockMask = nullptr;
8049 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8050 BlockMaskCache[Header] = BlockMask;
8051}
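// Illustrative sketch (not part of the upstream source): why the header mask
// compares IV <= BTC rather than IV < TC. The trip count TC = BTC + 1 can wrap
// to 0 in the IV's type (e.g. a loop running 256 iterations with an i8 IV),
// which would make "IV < TC" false for every lane, whereas "IV <= BTC" stays
// correct. The helper below computes the per-lane header mask for one wide
// iteration; the names are hypothetical and chosen for this example only.
static inline void headerMaskForLanes(uint8_t IVBase, uint8_t BTC,
                                      unsigned NumLanes, bool *MaskOut) {
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    uint8_t LaneIV = static_cast<uint8_t>(IVBase + Lane); // widened canonical IV
    // With BTC = 255 (256 iterations), TC wraps to 0 and "LaneIV < TC" would
    // disable all lanes; the ULE form keeps every in-bounds lane active.
    MaskOut[Lane] = LaneIV <= BTC;
  }
}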
8052
8054 // Return the cached value.
8055 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8056 assert(BCEntryIt != BlockMaskCache.end() &&
8057 "Trying to access mask for block without one.");
8058 return BCEntryIt->second;
8059}
8060
8062 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8063 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8064 assert(OrigLoop->getHeader() != BB &&
8065 "Loop header must have cached block mask");
8066
8067 // All-one mask is modelled as no-mask following the convention for masked
8068 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8069 VPValue *BlockMask = nullptr;
8070 // This is the block mask. We OR all incoming edges.
8071 for (auto *Predecessor : predecessors(BB)) {
8072 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8073 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8074 BlockMaskCache[BB] = EdgeMask;
8075 return;
8076 }
8077
8078 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8079 BlockMask = EdgeMask;
8080 continue;
8081 }
8082
8083 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8084 }
8085
8086 BlockMaskCache[BB] = BlockMask;
8087}
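// Illustrative sketch (not part of the upstream source): the "no mask means
// all-true" convention used by createBlockInMask above, modelled on a single
// lane with plain bools. An incoming edge without a mask makes the block
// unconditionally active; otherwise the block mask is the OR of all incoming
// edge masks. The LaneMask type is a stand-in for a VPValue mask.
struct LaneMask {
  bool HasMask; // false models the nullptr "all-true" mask
  bool Value;   // lane predicate, meaningful only when HasMask is true
};
static inline LaneMask combineIncomingEdgeMasks(const LaneMask *EdgeMasks,
                                                unsigned NumPreds) {
  LaneMask BlockMask = {false, true}; // nothing accumulated yet
  bool Seen = false;
  for (unsigned I = 0; I != NumPreds; ++I) {
    if (!EdgeMasks[I].HasMask)
      return {false, true}; // one all-true edge => block is always active
    BlockMask.Value =
        Seen ? (BlockMask.Value || EdgeMasks[I].Value) : EdgeMasks[I].Value;
    BlockMask.HasMask = true;
    Seen = true;
  }
  return BlockMask;
}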
8088
8090VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8091 VFRange &Range) {
8092 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8093 "Must be called with either a load or store");
8094
8095 auto willWiden = [&](ElementCount VF) -> bool {
8097 CM.getWideningDecision(I, VF);
8099 "CM decision should be taken at this point.");
8101 return true;
8102 if (CM.isScalarAfterVectorization(I, VF) ||
8103 CM.isProfitableToScalarize(I, VF))
8104 return false;
8106 };
8107
8109 return nullptr;
8110
8111 VPValue *Mask = nullptr;
8112 if (Legal->isMaskRequired(I))
8113 Mask = getBlockInMask(I->getParent());
8114
8115 // Determine if the pointer operand of the access is either consecutive or
8116 // reverse consecutive.
8118 CM.getWideningDecision(I, Range.Start);
8120 bool Consecutive =
8122
8123 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8124 if (Consecutive) {
8125 auto *GEP = dyn_cast<GetElementPtrInst>(
8126 Ptr->getUnderlyingValue()->stripPointerCasts());
8127 auto *VectorPtr = new VPVectorPointerRecipe(
8128 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8129 I->getDebugLoc());
8130 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8131 Ptr = VectorPtr;
8132 }
8133 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8134 return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8135 Reverse, I->getDebugLoc());
8136
8137 StoreInst *Store = cast<StoreInst>(I);
8139 *Store, Ptr, Operands[0], Mask, Consecutive, Reverse, I->getDebugLoc());
8140}
8141
8142 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8143/// insert a recipe to expand the step for the induction recipe.
8146 VPValue *Start, const InductionDescriptor &IndDesc,
8147 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8148 VFRange &Range) {
8149 assert(IndDesc.getStartValue() ==
8150 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8151 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8152 "step must be loop invariant");
8153
8154 VPValue *Step =
8156 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8157 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8158 }
8159 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8160 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8161}
8162
8163VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8165
8166 // Check if this is an integer or fp induction. If so, build the recipe that
8167 // produces its scalar and vector values.
8168 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8169 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8170 *PSE.getSE(), *OrigLoop, Range);
8171
8172 // Check if this is pointer induction. If so, build the recipe for it.
8173 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8174 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8175 *PSE.getSE());
8177 Phi, Operands[0], Step, *II,
8179 [&](ElementCount VF) {
8180 return CM.isScalarAfterVectorization(Phi, VF);
8181 },
8182 Range));
8183 }
8184 return nullptr;
8185}
8186
8187VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8189 // Optimize the special case where the source is a constant integer
8190 // induction variable. Notice that we can only optimize the 'trunc' case
8191 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8192 // (c) other casts depend on pointer size.
8193
8194 // Determine whether \p K is a truncation based on an induction variable that
8195 // can be optimized.
8196 auto isOptimizableIVTruncate =
8197 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8198 return [=](ElementCount VF) -> bool {
8199 return CM.isOptimizableIVTruncate(K, VF);
8200 };
8201 };
8202
8204 isOptimizableIVTruncate(I), Range)) {
8205
8206 auto *Phi = cast<PHINode>(I->getOperand(0));
8207 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8208 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8209 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8210 *OrigLoop, Range);
8211 }
8212 return nullptr;
8213}
8214
8215VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8217 unsigned NumIncoming = Phi->getNumIncomingValues();
8218
8219 // We know that all PHIs in non-header blocks are converted into selects, so
8220 // we don't have to worry about the insertion order and we can just use the
8221 // builder. At this point we generate the predication tree. There may be
8222 // duplications since this is a simple recursive scan, but future
8223 // optimizations will clean it up.
8224 // TODO: At the moment the first mask is always skipped, but it would be
8225 // better to skip the most expensive mask.
8226 SmallVector<VPValue *, 2> OperandsWithMask;
8227
8228 for (unsigned In = 0; In < NumIncoming; In++) {
8229 OperandsWithMask.push_back(Operands[In]);
8230 VPValue *EdgeMask =
8231 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8232 if (!EdgeMask) {
8233 assert(In == 0 && "Both null and non-null edge masks found");
8235 "Distinct incoming values with one having a full mask");
8236 break;
8237 }
8238 if (In == 0)
8239 continue;
8240 OperandsWithMask.push_back(EdgeMask);
8241 }
8242 return new VPBlendRecipe(Phi, OperandsWithMask);
8243}
8244
8245VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8247 VFRange &Range) {
8249 [this, CI](ElementCount VF) {
8250 return CM.isScalarWithPredication(CI, VF);
8251 },
8252 Range);
8253
8254 if (IsPredicated)
8255 return nullptr;
8256
8258 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8259 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8260 ID == Intrinsic::pseudoprobe ||
8261 ID == Intrinsic::experimental_noalias_scope_decl))
8262 return nullptr;
8263
8264 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8265
8266 // Is it beneficial to perform an intrinsic call compared to a lib call?
8267 bool ShouldUseVectorIntrinsic =
8269 [&](ElementCount VF) -> bool {
8270 return CM.getCallWideningDecision(CI, VF).Kind ==
8272 },
8273 Range);
8274 if (ShouldUseVectorIntrinsic)
8275 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8276 CI->getDebugLoc());
8277
8278 Function *Variant = nullptr;
8279 std::optional<unsigned> MaskPos;
8280 // Is it better to call a vectorized version of the function than to
8281 // scalarize the call?
8282 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8283 [&](ElementCount VF) -> bool {
8284 // The following case may be scalarized depending on the VF.
8285 // The flag shows whether we can use a usual Call for vectorized
8286 // version of the instruction.
8287
8288 // If we've found a variant at a previous VF, then stop looking. A
8289 // vectorized variant of a function expects input in a certain shape
8290 // -- basically the number of input registers, the number of lanes
8291 // per register, and whether there's a mask required.
8292 // We store a pointer to the variant in the VPWidenCallRecipe, so
8293 // once we have an appropriate variant it's only valid for that VF.
8294 // This will force a different vplan to be generated for each VF that
8295 // finds a valid variant.
8296 if (Variant)
8297 return false;
8299 CM.getCallWideningDecision(CI, VF);
8301 Variant = Decision.Variant;
8302 MaskPos = Decision.MaskPos;
8303 return true;
8304 }
8305
8306 return false;
8307 },
8308 Range);
8309 if (ShouldUseVectorCall) {
8310 if (MaskPos.has_value()) {
8311 // We have 2 cases that would require a mask:
8312 // 1) The block needs to be predicated, either due to a conditional
8313 // in the scalar loop or use of an active lane mask with
8314 // tail-folding, and we use the appropriate mask for the block.
8315 // 2) No mask is required for the block, but the only available
8316 // vector variant at this VF requires a mask, so we synthesize an
8317 // all-true mask.
8318 VPValue *Mask = nullptr;
8319 if (Legal->isMaskRequired(CI))
8320 Mask = getBlockInMask(CI->getParent());
8321 else
8323 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8324
8325 Ops.insert(Ops.begin() + *MaskPos, Mask);
8326 }
8327
8328 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8330 Variant);
8331 }
8332
8333 return nullptr;
8334}
8335
8336bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8337 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8338 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8339 // Instruction should be widened, unless it is scalar after vectorization,
8340 // scalarization is profitable or it is predicated.
8341 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8342 return CM.isScalarAfterVectorization(I, VF) ||
8343 CM.isProfitableToScalarize(I, VF) ||
8344 CM.isScalarWithPredication(I, VF);
8345 };
8347 Range);
8348}
8349
8350VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8352 VPBasicBlock *VPBB) {
8353 switch (I->getOpcode()) {
8354 default:
8355 return nullptr;
8356 case Instruction::SDiv:
8357 case Instruction::UDiv:
8358 case Instruction::SRem:
8359 case Instruction::URem: {
8360 // If not provably safe, use a select to form a safe divisor before widening the
8361 // div/rem operation itself. Otherwise fall through to general handling below.
8362 if (CM.isPredicatedInst(I)) {
8363 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8364 VPValue *Mask = getBlockInMask(I->getParent());
8365 VPValue *One =
8366 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8367 auto *SafeRHS =
8368 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8369 I->getDebugLoc());
8370 VPBB->appendRecipe(SafeRHS);
8371 Ops[1] = SafeRHS;
8372 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8373 }
8374 [[fallthrough]];
8375 }
8376 case Instruction::Add:
8377 case Instruction::And:
8378 case Instruction::AShr:
8379 case Instruction::FAdd:
8380 case Instruction::FCmp:
8381 case Instruction::FDiv:
8382 case Instruction::FMul:
8383 case Instruction::FNeg:
8384 case Instruction::FRem:
8385 case Instruction::FSub:
8386 case Instruction::ICmp:
8387 case Instruction::LShr:
8388 case Instruction::Mul:
8389 case Instruction::Or:
8390 case Instruction::Select:
8391 case Instruction::Shl:
8392 case Instruction::Sub:
8393 case Instruction::Xor:
8394 case Instruction::Freeze:
8395 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8396 };
8397}
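// Illustrative sketch (not part of the upstream source): the scalar equivalent
// of the safe-divisor transform applied above for predicated sdiv/udiv/srem/
// urem. A lane whose mask bit is off divides by 1 instead of the original
// (possibly zero) divisor, so the widened division cannot trap; the result of
// an inactive lane is never used. Names are hypothetical.
static inline int predicatedSDivLane(bool LaneActive, int Dividend,
                                     int Divisor) {
  int SafeDivisor = LaneActive ? Divisor : 1; // "select mask, rhs, 1"
  int Quotient = Dividend / SafeDivisor;      // never divides by zero
  return LaneActive ? Quotient : 0;           // inactive-lane value is unused
}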
8398
8400 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8401 for (VPHeaderPHIRecipe *R : PhisToFix) {
8402 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8403 VPRecipeBase *IncR =
8404 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8405 R->addOperand(IncR->getVPSingleValue());
8406 }
8407}
8408
8410 VFRange &Range) {
8412 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8413 Range);
8414
8415 bool IsPredicated = CM.isPredicatedInst(I);
8416
8417 // Even if the instruction is not marked as uniform, there are certain
8418 // intrinsic calls that can be effectively treated as such, so we check for
8419 // them here. Conservatively, we only do this for scalable vectors, since
8420 // for fixed-width VFs we can always fall back on full scalarization.
8421 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8422 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8423 case Intrinsic::assume:
8424 case Intrinsic::lifetime_start:
8425 case Intrinsic::lifetime_end:
8426 // For scalable vectors if one of the operands is variant then we still
8427 // want to mark as uniform, which will generate one instruction for just
8428 // the first lane of the vector. We can't scalarize the call in the same
8429 // way as for fixed-width vectors because we don't know how many lanes
8430 // there are.
8431 //
8432 // The reasons for doing it this way for scalable vectors are:
8433 // 1. For the assume intrinsic generating the instruction for the first
8434 // lane is still better than not generating any at all. For
8435 // example, the input may be a splat across all lanes.
8436 // 2. For the lifetime start/end intrinsics the pointer operand only
8437 // does anything useful when the input comes from a stack object,
8438 // which suggests it should always be uniform. For non-stack objects
8439 // the effect is to poison the object, which still allows us to
8440 // remove the call.
8441 IsUniform = true;
8442 break;
8443 default:
8444 break;
8445 }
8446 }
8447 VPValue *BlockInMask = nullptr;
8448 if (!IsPredicated) {
8449 // Finalize the recipe for Instr, first if it is not predicated.
8450 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8451 } else {
8452 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8453 // Instructions marked for predication are replicated and a mask operand is
8454 // added initially. Masked replicate recipes will later be placed under an
8455 // if-then construct to prevent side-effects. Generate recipes to compute
8456 // the block mask for this region.
8457 BlockInMask = getBlockInMask(I->getParent());
8458 }
8459
8460 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8461 IsUniform, BlockInMask);
8462 return Recipe;
8463}
8464
8468 VFRange &Range, VPBasicBlock *VPBB) {
8469 // First, check for specific widening recipes that deal with inductions, Phi
8470 // nodes, calls and memory operations.
8471 VPRecipeBase *Recipe;
8472 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8473 if (Phi->getParent() != OrigLoop->getHeader())
8474 return tryToBlend(Phi, Operands);
8475
8476 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8477 return Recipe;
8478
8479 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8480 assert((Legal->isReductionVariable(Phi) ||
8481 Legal->isFixedOrderRecurrence(Phi)) &&
8482 "can only widen reductions and fixed-order recurrences here");
8483 VPValue *StartV = Operands[0];
8484 if (Legal->isReductionVariable(Phi)) {
8485 const RecurrenceDescriptor &RdxDesc =
8486 Legal->getReductionVars().find(Phi)->second;
8487 assert(RdxDesc.getRecurrenceStartValue() ==
8488 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8489 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8490 CM.isInLoopReduction(Phi),
8491 CM.useOrderedReductions(RdxDesc));
8492 } else {
8493 // TODO: Currently fixed-order recurrences are modeled as chains of
8494 // first-order recurrences. If there are no users of the intermediate
8495 // recurrences in the chain, the fixed order recurrence should be modeled
8496 // directly, enabling more efficient codegen.
8497 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8498 }
8499
8500 PhisToFix.push_back(PhiRecipe);
8501 return PhiRecipe;
8502 }
8503
8504 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8505 cast<TruncInst>(Instr), Operands, Range)))
8506 return Recipe;
8507
8508 // All widen recipes below deal only with VF > 1.
8510 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8511 return nullptr;
8512
8513 if (auto *CI = dyn_cast<CallInst>(Instr))
8514 return tryToWidenCall(CI, Operands, Range);
8515
8516 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8517 return tryToWidenMemory(Instr, Operands, Range);
8518
8519 if (!shouldWiden(Instr, Range))
8520 return nullptr;
8521
8522 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8523 return new VPWidenGEPRecipe(GEP,
8524 make_range(Operands.begin(), Operands.end()));
8525
8526 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8527 return new VPWidenSelectRecipe(
8528 *SI, make_range(Operands.begin(), Operands.end()));
8529 }
8530
8531 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8532 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8533 *CI);
8534 }
8535
8536 return tryToWiden(Instr, Operands, VPBB);
8537}
8538
8539void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8540 ElementCount MaxVF) {
8541 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8542
8543 auto MaxVFTimes2 = MaxVF * 2;
8544 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8545 VFRange SubRange = {VF, MaxVFTimes2};
8546 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8547 // Now optimize the initial VPlan.
8548 if (!Plan->hasVF(ElementCount::getFixed(1)))
8550 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8551 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8552 // TODO: try to put it close to addActiveLaneMask().
8553 if (CM.foldTailWithEVL())
8555 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8556 VPlans.push_back(std::move(Plan));
8557 }
8558 VF = SubRange.End;
8559 }
8560}
8561
8562// Add the necessary canonical IV and branch recipes required to control the
8563// loop.
8564static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8565 DebugLoc DL) {
8566 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8567 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8568
8569 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8570 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8571 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8572 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8573 Header->insert(CanonicalIVPHI, Header->begin());
8574
8575 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8576 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8577 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8578 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8579 "index.next");
8580 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8581
8582 // Add the BranchOnCount VPInstruction to the latch.
8584 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8585}
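// Illustrative sketch (not part of the upstream source): a scalar model of the
// loop control set up by addCanonicalIVRecipes. The canonical IV starts at 0,
// is bumped by VF * UF each wide iteration (the VFxUF value), and the latch
// branch corresponds to BranchOnCount against the vector trip count, which is
// a multiple of the step by construction. A fixed VF is assumed; the names are
// hypothetical and uint64_t assumes <cstdint> is available.
static inline void canonicalIVLoopControl(uint64_t VectorTripCount, unsigned VF,
                                          unsigned UF,
                                          void (*WideIterationBody)(uint64_t)) {
  uint64_t Step = static_cast<uint64_t>(VF) * UF; // VFxUF
  for (uint64_t Index = 0; Index != VectorTripCount; Index += Step) {
    // index.next = index + VF * UF; BranchOnCount exits once it reaches the
    // vector trip count.
    WideIterationBody(Index);
  }
}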
8586
8587// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8588// original exit block.
8589static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8590 VPRecipeBuilder &Builder, VPlan &Plan) {
8591 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8592 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8593 // Only handle single-exit loops with unique exit blocks for now.
8594 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8595 return;
8596
8597 // Introduce VPUsers modeling the exit values.
8598 for (PHINode &ExitPhi : ExitBB->phis()) {
8599 Value *IncomingValue =
8600 ExitPhi.getIncomingValueForBlock(ExitingBB);
8601 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8602 Plan.addLiveOut(&ExitPhi, V);
8603 }
8604}
8605
8607LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8608
8610
8611 // ---------------------------------------------------------------------------
8612 // Build initial VPlan: Scan the body of the loop in a topological order to
8613 // visit each basic block after having visited its predecessor basic blocks.
8614 // ---------------------------------------------------------------------------
8615
8616 // Create initial VPlan skeleton, having a basic block for the pre-header
8617 // which contains SCEV expansions that need to happen before the CFG is
8618 // modified; a basic block for the vector pre-header, followed by a region for
8619 // the vector loop, followed by the middle basic block. The skeleton vector
8620 // loop region contains a header and latch basic blocks.
8622 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8623 *PSE.getSE());
8624 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8625 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8626 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8627 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8628 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8629
8630 // Don't use getDecisionAndClampRange here, because we don't know the UF,
8631 // so it is better for this function to be conservative rather than to
8632 // split the range up into different VPlans.
8633 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8634 bool IVUpdateMayOverflow = false;
8635 for (ElementCount VF : Range)
8636 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8637
8639 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8640 // When not folding the tail, we know that the induction increment will not
8641 // overflow.
8642 bool HasNUW = Style == TailFoldingStyle::None;
8643 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8644
8645 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8646
8647 // ---------------------------------------------------------------------------
8648 // Pre-construction: record ingredients whose recipes we'll need to further
8649 // process after constructing the initial VPlan.
8650 // ---------------------------------------------------------------------------
8651
8652 // For each interleave group which is relevant for this (possibly trimmed)
8653 // Range, add it to the set of groups to be later applied to the VPlan and add
8654 // placeholders for its members' Recipes which we'll be replacing with a
8655 // single VPInterleaveRecipe.
8657 auto applyIG = [IG, this](ElementCount VF) -> bool {
8658 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8659 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8661 // For scalable vectors, the only interleave factor currently supported
8662 // is 2 since we require the (de)interleave2 intrinsics instead of
8663 // shufflevectors.
8664 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8665 "Unsupported interleave factor for scalable vectors");
8666 return Result;
8667 };
8668 if (!getDecisionAndClampRange(applyIG, Range))
8669 continue;
8670 InterleaveGroups.insert(IG);
8671 };
8672
8673 // ---------------------------------------------------------------------------
8674 // Construct recipes for the instructions in the loop
8675 // ---------------------------------------------------------------------------
8676
8677 // Scan the body of the loop in a topological order to visit each basic block
8678 // after having visited its predecessor basic blocks.
8679 LoopBlocksDFS DFS(OrigLoop);
8680 DFS.perform(LI);
8681
8682 VPBasicBlock *VPBB = HeaderVPBB;
8683 BasicBlock *HeaderBB = OrigLoop->getHeader();
8684 bool NeedsMasks =
8685 CM.foldTailByMasking() ||
8686 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8687 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8688 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8689 });
8690 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8691 // Relevant instructions from basic block BB will be grouped into VPRecipe
8692 // ingredients and fill a new VPBasicBlock.
8693 if (VPBB != HeaderVPBB)
8694 VPBB->setName(BB->getName());
8695 Builder.setInsertPoint(VPBB);
8696
8697 if (VPBB == HeaderVPBB)
8698 RecipeBuilder.createHeaderMask();
8699 else if (NeedsMasks)
8700 RecipeBuilder.createBlockInMask(BB);
8701
8702 // Introduce each ingredient into VPlan.
8703 // TODO: Model and preserve debug intrinsics in VPlan.
8704 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8705 Instruction *Instr = &I;
8707 auto *Phi = dyn_cast<PHINode>(Instr);
8708 if (Phi && Phi->getParent() == HeaderBB) {
8709 Operands.push_back(Plan->getOrAddLiveIn(
8710 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8711 } else {
8712 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8713 Operands = {OpRange.begin(), OpRange.end()};
8714 }
8715
8716 // Invariant stores inside the loop will be deleted and a single store
8717 // with the final reduction value will be added to the exit block.
8718 StoreInst *SI;
8719 if ((SI = dyn_cast<StoreInst>(&I)) &&
8720 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8721 continue;
8722
8723 VPRecipeBase *Recipe =
8724 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8725 if (!Recipe)
8726 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8727
8728 RecipeBuilder.setRecipe(Instr, Recipe);
8729 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8730 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8731 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8732 // recipes and need to be moved to the phi section of HeaderVPBB:
8733 // * tail-folding (non-phi recipes computing the header mask are
8734 // introduced earlier than regular header phi recipes, and should appear
8735 // after them)
8736 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8737
8738 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8739 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8740 "unexpected recipe needs moving");
8741 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8742 } else
8743 VPBB->appendRecipe(Recipe);
8744 }
8745
8747 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8748 }
8749
8750 // After here, VPBB should not be used.
8751 VPBB = nullptr;
8752
8753 if (CM.requiresScalarEpilogue(Range)) {
8754 // No edge from the middle block to the unique exit block has been inserted
8755 // and there is nothing to fix from the vector loop; phis should have
8756 // incoming values from the scalar loop only.
8757 } else
8758 addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
8759
8760 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8761 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8762 "entry block must be set to a VPRegionBlock having a non-empty entry "
8763 "VPBasicBlock");
8764 RecipeBuilder.fixHeaderPhis();
8765
8766 // ---------------------------------------------------------------------------
8767 // Transform initial VPlan: Apply previously taken decisions, in order, to
8768 // bring the VPlan to its final state.
8769 // ---------------------------------------------------------------------------
8770
8771 // Adjust the recipes for any inloop reductions.
8772 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8773
8774 // Interleave memory: for each Interleave Group we marked earlier as relevant
8775 // for this VPlan, replace the Recipes widening its memory instructions with a
8776 // single VPInterleaveRecipe at its insertion point.
8777 for (const auto *IG : InterleaveGroups) {
8778 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8779 RecipeBuilder.getRecipe(IG->getInsertPos()));
8780 SmallVector<VPValue *, 4> StoredValues;
8781 for (unsigned i = 0; i < IG->getFactor(); ++i)
8782 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8783 auto *StoreR =
8784 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8785 StoredValues.push_back(StoreR->getStoredValue());
8786 }
8787
8788 bool NeedsMaskForGaps =
8789 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8790 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8791 Recipe->getMask(), NeedsMaskForGaps);
8792 VPIG->insertBefore(Recipe);
8793 unsigned J = 0;
8794 for (unsigned i = 0; i < IG->getFactor(); ++i)
8795 if (Instruction *Member = IG->getMember(i)) {
8796 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8797 if (!Member->getType()->isVoidTy()) {
8798 VPValue *OriginalV = MemberR->getVPSingleValue();
8799 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8800 J++;
8801 }
8802 MemberR->eraseFromParent();
8803 }
8804 }
8805
8806 for (ElementCount VF : Range)
8807 Plan->addVF(VF);
8808 Plan->setName("Initial VPlan");
8809
8810 // Replace VPValues for known constant strides guaranteed by predicate scalar
8811 // evolution.
8812 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8813 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8814 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8815 // Only handle constant strides for now.
8816 if (!ScevStride)
8817 continue;
8818 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8819
8820 auto *ConstVPV = Plan->getOrAddLiveIn(CI);
8821 // The versioned value may not be used in the loop directly, so just add a
8822 // new live-in in those cases.
8823 Plan->getOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8824 }
8825
8827 return Legal->blockNeedsPredication(BB);
8828 });
8829
8830 // Sink users of fixed-order recurrence past the recipe defining the previous
8831 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8833 return nullptr;
8834
8835 if (useActiveLaneMask(Style)) {
8836 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8837 // TailFoldingStyle is visible there.
8838 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8839 bool WithoutRuntimeCheck =
8841 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8842 WithoutRuntimeCheck);
8843 }
8844 return Plan;
8845}
8846
8847VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8848 // Outer loop handling: outer loops may require CFG and instruction level
8849 // transformations before even evaluating whether vectorization is profitable.
8850 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8851 // the vectorization pipeline.
8852 assert(!OrigLoop->isInnermost());
8853 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8854
8855 // Create new empty VPlan
8856 auto Plan = VPlan::createInitialVPlan(
8857 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8858 *PSE.getSE());
8859
8860 // Build hierarchical CFG
8861 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8862 HCFGBuilder.buildHierarchicalCFG();
8863
8864 for (ElementCount VF : Range)
8865 Plan->addVF(VF);
8866
8868 Plan,
8869 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8870 *PSE.getSE(), *TLI);
8871
8872 // Remove the existing terminator of the exiting block of the top-most region.
8873 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8874 auto *Term =
8875 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8876 Term->eraseFromParent();
8877
8878 // Tail folding is not supported for outer loops, so the induction increment
8879 // is guaranteed to not wrap.
8880 bool HasNUW = true;
8881 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8882 DebugLoc());
8883 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8884 return Plan;
8885}
8886
8887// Adjust the recipes for reductions. For in-loop reductions the chain of
8888// instructions leading from the loop exit instr to the phi need to be converted
8889// to reductions, with one operand being vector and the other being the scalar
8890// reduction chain. For other reductions, a select is introduced between the phi
8891// and live-out recipes when folding the tail.
8892//
8893// A ComputeReductionResult recipe is added to the middle block, also for
8894// in-loop reductions which compute their result in-loop, because generating
8895// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8896//
8897// Adjust AnyOf reductions; replace the reduction phi for the selected value
8898// with a boolean reduction phi node to check if the condition is true in any
8899// iteration. The final value is selected by the final ComputeReductionResult.
8900void LoopVectorizationPlanner::adjustRecipesForReductions(
8901 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8902 ElementCount MinVF) {
8903 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8904 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8905 // Gather all VPReductionPHIRecipes and sort them so that intermediate
8906 // stores sunk outside of the loop keep the same order as they had in the
8907 // original loop.
8908 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8909 for (VPRecipeBase &R : Header->phis()) {
8910 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8911 ReductionPHIList.emplace_back(ReductionPhi);
8912 }
8913 bool HasIntermediateStore = false;
8914 stable_sort(ReductionPHIList,
8915 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8916 const VPReductionPHIRecipe *R2) {
8917 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8918 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8919 HasIntermediateStore |= IS1 || IS2;
8920
8921 // If neither of the recipes has an intermediate store, keep the
8922 // order the same.
8923 if (!IS1 && !IS2)
8924 return false;
8925
8926 // If only one of the recipes has an intermediate store, then
8927 // move it towards the beginning of the list.
8928 if (IS1 && !IS2)
8929 return true;
8930
8931 if (!IS1 && IS2)
8932 return false;
8933
8934 // If both recipes have an intermediate store, then the recipe
8935 // with the later store should be processed earlier. So it
8936 // should go to the beginning of the list.
8937 return DT->dominates(IS2, IS1);
8938 });
8939
8940 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8941 for (VPRecipeBase *R : ReductionPHIList)
8942 R->moveBefore(*Header, Header->getFirstNonPhi());
8943
8944 for (VPRecipeBase &R : Header->phis()) {
8945 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8946 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8947 continue;
8948
8949 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8950 RecurKind Kind = RdxDesc.getRecurrenceKind();
8952 "AnyOf reductions are not allowed for in-loop reductions");
8953
8954 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8956 Worklist.insert(PhiR);
8957 for (unsigned I = 0; I != Worklist.size(); ++I) {
8958 VPSingleDefRecipe *Cur = Worklist[I];
8959 for (VPUser *U : Cur->users()) {
8960 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8961 if (!UserRecipe) {
8962 assert(isa<VPLiveOut>(U) &&
8963 "U must either be a VPSingleDef or VPLiveOut");
8964 continue;
8965 }
8966 Worklist.insert(UserRecipe);
8967 }
8968 }
8969
8970 // Visit operation "Links" along the reduction chain top-down starting from
8971 // the phi until LoopExitValue. We keep track of the previous item
8972 // (PreviousLink) to tell which of the two operands of a Link will remain
8973 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8974 // the select instructions. Blend recipes of in-loop reduction phis will
8975 // get folded to their non-phi operand, as the reduction recipe handles the
8976 // condition directly.
8977 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8978 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8979 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8980
8981 // Index of the first operand which holds a non-mask vector operand.
8982 unsigned IndexOfFirstOperand;
8983 // Recognize a call to the llvm.fmuladd intrinsic.
8984 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8985 VPValue *VecOp;
8986 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8987 if (IsFMulAdd) {
8988 assert(
8990 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8991 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8992 isa<VPWidenCallRecipe>(CurrentLink)) &&
8993 CurrentLink->getOperand(2) == PreviousLink &&
8994 "expected a call where the previous link is the added operand");
8995
8996 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8997 // need to create an fmul recipe (multiplying the first two operands of
8998 // the fmuladd together) to use as the vector operand for the fadd
8999 // reduction.
9000 VPInstruction *FMulRecipe = new VPInstruction(
9001 Instruction::FMul,
9002 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9003 CurrentLinkI->getFastMathFlags());
9004 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9005 VecOp = FMulRecipe;
9006 } else {
9007 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9008 if (PhiR->isInLoop() && Blend) {
9009 assert(Blend->getNumIncomingValues() == 2 &&
9010 "Blend must have 2 incoming values");
9011 if (Blend->getIncomingValue(0) == PhiR)
9012 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9013 else {
9014 assert(Blend->getIncomingValue(1) == PhiR &&
9015 "PhiR must be an operand of the blend");
9016 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9017 }
9018 continue;
9019 }
9020
9022 if (isa<VPWidenRecipe>(CurrentLink)) {
9023 assert(isa<CmpInst>(CurrentLinkI) &&
9024 "need to have the compare of the select");
9025 continue;
9026 }
9027 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9028 "must be a select recipe");
9029 IndexOfFirstOperand = 1;
9030 } else {
9031 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9032 "Expected to replace a VPWidenSC");
9033 IndexOfFirstOperand = 0;
9034 }
9035 // Note that for non-commutable operands (cmp-selects), the semantics of
9036 // the cmp-select are captured in the recurrence kind.
9037 unsigned VecOpId =
9038 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9039 ? IndexOfFirstOperand + 1
9040 : IndexOfFirstOperand;
9041 VecOp = CurrentLink->getOperand(VecOpId);
9042 assert(VecOp != PreviousLink &&
9043 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9044 (VecOpId - IndexOfFirstOperand)) ==
9045 PreviousLink &&
9046 "PreviousLink must be the operand other than VecOp");
9047 }
9048
9049 BasicBlock *BB = CurrentLinkI->getParent();
9050 VPValue *CondOp = nullptr;
9052 CondOp = RecipeBuilder.getBlockInMask(BB);
9053
9054 VPReductionRecipe *RedRecipe =
9055 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
9056 CondOp, CM.useOrderedReductions(RdxDesc));
9057 // Append the recipe to the end of the VPBasicBlock because we need to
9058 // ensure that it comes after all of its inputs, including CondOp.
9059 // Note that this transformation may leave over dead recipes (including
9060 // CurrentLink), which will be cleaned by a later VPlan transform.
9061 LinkVPBB->appendRecipe(RedRecipe);
9062 CurrentLink->replaceAllUsesWith(RedRecipe);
9063 PreviousLink = RedRecipe;
9064 }
9065 }
9066 Builder.setInsertPoint(&*LatchVPBB->begin());
9067 for (VPRecipeBase &R :
9068 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9069 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9070 if (!PhiR)
9071 continue;
9072
9073 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9074 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9075 // with a boolean reduction phi node to check if the condition is true in
9076 // any iteration. The final value is selected by the final
9077 // ComputeReductionResult.
9079 RdxDesc.getRecurrenceKind())) {
9080 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9081 return isa<VPWidenSelectRecipe>(U) ||
9082 (isa<VPReplicateRecipe>(U) &&
9083 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9084 Instruction::Select);
9085 }));
9086 VPValue *Cmp = Select->getOperand(0);
9087 // If the compare is checking the reduction PHI node, adjust it to check
9088 // the start value.
9089 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9090 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9091 if (CmpR->getOperand(I) == PhiR)
9092 CmpR->setOperand(I, PhiR->getStartValue());
9093 }
9094 VPBuilder::InsertPointGuard Guard(Builder);
9095 Builder.setInsertPoint(Select);
9096
9097 // If the true value of the select is the reduction phi, the new value is
9098 // selected if the negated condition is true in any iteration.
9099 if (Select->getOperand(1) == PhiR)
9100 Cmp = Builder.createNot(Cmp);
9101 VPValue *Or = Builder.createOr(PhiR, Cmp);
9102 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9103
9104 // Convert the reduction phi to operate on bools.
9105 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9106 OrigLoop->getHeader()->getContext())));
9107 }
9108
9109 // If the tail is folded by masking, introduce selects between the phi
9110 // and the live-out instruction of each reduction, at the beginning of the
9111 // dedicated latch block.
9112 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9113 auto *NewExitingVPV = PhiR->getBackedgeValue();
9114 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9115 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9116 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9117 "reduction recipe must be defined before latch");
9118 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9119 std::optional<FastMathFlags> FMFs =
9120 PhiTy->isFloatingPointTy()
9121 ? std::make_optional(RdxDesc.getFastMathFlags())
9122 : std::nullopt;
9123 NewExitingVPV =
9124 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9125 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9126 return isa<VPInstruction>(&U) &&
9127 cast<VPInstruction>(&U)->getOpcode() ==
9129 });
9132 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9134 PhiR->setOperand(1, NewExitingVPV);
9135 }
9136
9137 // If the vector reduction can be performed in a smaller type, we truncate
9138 // then extend the loop exit value to enable InstCombine to evaluate the
9139 // entire expression in the smaller type.
9140 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9141 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9143 RdxDesc.getRecurrenceKind())) {
9144 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9145 Type *RdxTy = RdxDesc.getRecurrenceType();
9146 auto *Trunc =
9147 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9148 auto *Extnd =
9149 RdxDesc.isSigned()
9150 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9151 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9152
9153 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9154 Extnd->insertAfter(Trunc);
9155 if (PhiR->getOperand(1) == NewExitingVPV)
9156 PhiR->setOperand(1, Extnd->getVPSingleValue());
9157 NewExitingVPV = Extnd;
9158 }
9159
9160 // We want code in the middle block to appear to execute on the location of
9161 // the scalar loop's latch terminator because: (a) it is all compiler
9162 // generated, (b) these instructions are always executed after evaluating
9163 // the latch conditional branch, and (c) other passes may add new
9164 // predecessors which terminate on this line. This is the easiest way to
9165 // ensure we don't accidentally cause an extra step back into the loop while
9166 // debugging.
9167 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9168
9169 // TODO: At the moment ComputeReductionResult also drives creation of the
9170 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9171 // even for in-loop reductions, until the reduction resume value handling is
9172 // also modeled in VPlan.
9173 auto *FinalReductionResult = new VPInstruction(
9174 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9175 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9176 ->appendRecipe(FinalReductionResult);
9177 OrigExitingVPV->replaceUsesWithIf(
9178 FinalReductionResult,
9179 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9180 }
9181
9183}
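// The AnyOf handling above replaces a select-based reduction with a boolean
// or-reduction plus a single final select. A minimal scalar sketch of the
// semantics being preserved; the helper below is illustrative only and is not
// part of this pass:
static int anyOfReductionSketch(const int *A, int N, int OnTrue, int Start) {
  bool AnyMatched = false;    // boolean reduction phi, starts at false
  for (int I = 0; I < N; ++I)
    AnyMatched |= (A[I] > 0); // or-reduction of the per-iteration compare
  // Final ComputeReductionResult: pick the selected value if the condition
  // held in any iteration, otherwise keep the start value.
  return AnyMatched ? OnTrue : Start;
}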
9184
9185#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9187 VPSlotTracker &SlotTracker) const {
9188 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9189 IG->getInsertPos()->printAsOperand(O, false);
9190 O << ", ";
9192 VPValue *Mask = getMask();
9193 if (Mask) {
9194 O << ", ";
9195 Mask->printAsOperand(O, SlotTracker);
9196 }
9197
9198 unsigned OpIdx = 0;
9199 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9200 if (!IG->getMember(i))
9201 continue;
9202 if (getNumStoreOperands() > 0) {
9203 O << "\n" << Indent << " store ";
9204 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9205 O << " to index " << i;
9206 } else {
9207 O << "\n" << Indent << " ";
9209 O << " = load from index " << i;
9210 }
9211 ++OpIdx;
9212 }
9213}
9214#endif
9215
9218 "Not a pointer induction according to InductionDescriptor!");
9219 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9220 "Unexpected type.");
9222 "Recipe should have been replaced");
9223
9224 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9225 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9226 Type *PhiType = IndDesc.getStep()->getType();
9227
9228 // Build a pointer phi
9229 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9230 Type *ScStValueType = ScalarStartValue->getType();
9231 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9232 CanonicalIV->getIterator());
9233
9234 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9235 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9236
9237 // A pointer induction, performed by using a gep
9238 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9239
9240 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9241 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9242 Value *NumUnrolledElems =
9243 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9244 Value *InductionGEP = GetElementPtrInst::Create(
9245 State.Builder.getInt8Ty(), NewPointerPhi,
9246 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9247 InductionLoc);
9248 // Add induction update using an incorrect block temporarily. The phi node
9249 // will be fixed after VPlan execution. Note that at this point the latch
9250 // block cannot be used, as it does not exist yet.
9251 // TODO: Model increment value in VPlan, by turning the recipe into a
9252 // multi-def and a subclass of VPHeaderPHIRecipe.
9253 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9254
9255 // Create UF many actual address geps that use the pointer
9256 // phi as base and a vectorized version of the step value
9257 // (<step*0, ..., step*(VF-1)>) as offset.
9258 for (unsigned Part = 0; Part < State.UF; ++Part) {
9259 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9260 Value *StartOffsetScalar =
9261 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9262 Value *StartOffset =
9263 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9264 // Create a vector of consecutive numbers from zero to VF-1.
9265 StartOffset = State.Builder.CreateAdd(
9266 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9267
9268 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9269 "scalar step must be the same across all parts");
9270 Value *GEP = State.Builder.CreateGEP(
9271 State.Builder.getInt8Ty(), NewPointerPhi,
9272 State.Builder.CreateMul(
9273 StartOffset,
9274 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9275 "vector.gep"));
9276 State.set(this, GEP, Part);
9277 }
9278}
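// A minimal sketch of the per-lane offsets the GEPs above materialize for one
// unrolled part: lane L of part P addresses Base + (RuntimeVF * P + L) * Step.
// The helper below is illustrative only and assumes fixed-width vectors, so
// RuntimeVF == VF:
static void pointerInductionOffsetsSketch(long long Step, unsigned VF,
                                          unsigned Part, long long *Offsets) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Offsets[Lane] = (static_cast<long long>(VF) * Part + Lane) * Step;
}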
9279
9281 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9282
9283 // Fast-math-flags propagate from the original induction instruction.
9285 if (FPBinOp)
9286 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9287
9288 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9289 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9290 Value *DerivedIV = emitTransformedIndex(
9291 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9292 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9293 DerivedIV->setName("offset.idx");
9294 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9295
9296 State.set(this, DerivedIV, VPIteration(0, 0));
9297}
9298
9300 assert(!State.Instance && "Interleave group being replicated.");
9301 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9303 NeedsMaskForGaps);
9304}
9305
9308 if (State.Instance) { // Generate a single instance.
9309 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9310 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9311 // Insert scalar instance packing it into a vector.
9312 if (State.VF.isVector() && shouldPack()) {
9313 // If we're constructing lane 0, initialize to start from poison.
9314 if (State.Instance->Lane.isFirstLane()) {
9315 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9316 Value *Poison = PoisonValue::get(
9317 VectorType::get(UI->getType(), State.VF));
9318 State.set(this, Poison, State.Instance->Part);
9319 }
9320 State.packScalarIntoVectorValue(this, *State.Instance);
9321 }
9322 return;
9323 }
9324
9325 if (IsUniform) {
9326 // If the recipe is uniform across all parts (instead of just per VF), only
9327 // generate a single instance.
9328 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9329 all_of(operands(), [](VPValue *Op) {
9330 return Op->isDefinedOutsideVectorRegions();
9331 })) {
9332 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9333 if (user_begin() != user_end()) {
9334 for (unsigned Part = 1; Part < State.UF; ++Part)
9335 State.set(this, State.get(this, VPIteration(0, 0)),
9336 VPIteration(Part, 0));
9337 }
9338 return;
9339 }
9340
9341 // Uniform within VL means we need to generate lane 0 only for each
9342 // unrolled copy.
9343 for (unsigned Part = 0; Part < State.UF; ++Part)
9344 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9345 return;
9346 }
9347
9348 // A store of a loop varying value to a uniform address only needs the last
9349 // copy of the store.
9350 if (isa<StoreInst>(UI) &&
9352 auto Lane = VPLane::getLastLaneForVF(State.VF);
9353 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9354 State);
9355 return;
9356 }
9357
9358 // Generate scalar instances for all VF lanes of all UF parts.
9359 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9360 const unsigned EndLane = State.VF.getKnownMinValue();
9361 for (unsigned Part = 0; Part < State.UF; ++Part)
9362 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9363 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9364}
9365
9366 /// Creates either vp_store or vp_scatter intrinsic calls to represent
9367/// predicated store/scatter.
9368static Instruction *
9370 Value *StoredVal, bool IsScatter, Value *Mask,
9371 Value *EVL, const Align &Alignment) {
9372 CallInst *Call;
9373 if (IsScatter) {
9374 Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9375 Intrinsic::vp_scatter,
9376 {StoredVal, Addr, Mask, EVL});
9377 } else {
9378 VectorBuilder VBuilder(Builder);
9379 VBuilder.setEVL(EVL).setMask(Mask);
9380 Call = cast<CallInst>(VBuilder.createVectorInstruction(
9381 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9382 {StoredVal, Addr}));
9383 }
9384 Call->addParamAttr(
9385 1, Attribute::getWithAlignment(Call->getContext(), Alignment));
9386 return Call;
9387}
9388
9389 /// Creates either vp_load or vp_gather intrinsic calls to represent
9390/// predicated load/gather.
9392 VectorType *DataTy,
9393 Value *Addr, bool IsGather,
9394 Value *Mask, Value *EVL,
9395 const Align &Alignment) {
9396 CallInst *Call;
9397 if (IsGather) {
9398 Call =
9399 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9400 nullptr, "wide.masked.gather");
9401 } else {
9402 VectorBuilder VBuilder(Builder);
9403 VBuilder.setEVL(EVL).setMask(Mask);
9404 Call = cast<CallInst>(VBuilder.createVectorInstruction(
9405 Instruction::Load, DataTy, Addr, "vp.op.load"));
9406 }
9407 Call->addParamAttr(
9408 0, Attribute::getWithAlignment(Call->getContext(), Alignment));
9409 return Call;
9410}
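// A minimal scalar model of the EVL-predicated accesses built by the two
// helpers above: only lanes below the explicit vector length with a true mask
// bit touch memory; all other lanes are left untouched (poison in IR). The
// helper below is illustrative only:
static void evlMaskedLoadSketch(const int *Addr, const bool *Mask, unsigned EVL,
                                unsigned VF, int *Dst) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    if (Lane < EVL && Mask[Lane])
      Dst[Lane] = Addr[Lane]; // inactive lanes perform no memory access
}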
9411
9413 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9414
9415 // Attempt to issue a wide load.
9416 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9417 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9418
9419 assert((LI || SI) && "Invalid Load/Store instruction");
9420 assert((!SI || StoredValue) && "No stored value provided for widened store");
9421 assert((!LI || !StoredValue) && "Stored value provided for widened load");
9422
9423 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9424
9425 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9426 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9427 bool CreateGatherScatter = !isConsecutive();
9428
9429 auto &Builder = State.Builder;
9430 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9431 bool isMaskRequired = getMask();
9432 if (isMaskRequired) {
9433 // Mask reversal is only needed for non-null masks: a null mask stands for an
9434 // all-ones mask, and the reverse of an all-ones mask is still all-ones.
9435 for (unsigned Part = 0; Part < State.UF; ++Part) {
9436 Value *Mask = State.get(getMask(), Part);
9437 if (isReverse())
9438 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9439 BlockInMaskParts[Part] = Mask;
9440 }
9441 }
9442
9443 // Handle Stores:
9444 if (SI) {
9446
9447 for (unsigned Part = 0; Part < State.UF; ++Part) {
9448 Instruction *NewSI = nullptr;
9449 Value *StoredVal = State.get(StoredValue, Part);
9450 // TODO: split this into several classes for better design.
9451 if (State.EVL) {
9452 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9453 "explicit vector length.");
9454 assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
9455 VPInstruction::ExplicitVectorLength &&
9456 "EVL must be VPInstruction::ExplicitVectorLength.");
9457 Value *EVL = State.get(State.EVL, VPIteration(0, 0));
9458 // If EVL is not nullptr, then it must be a valid value set during plan
9459 // creation, possibly defaulting to the whole vector register length. EVL
9460 // is created only if TTI prefers predicated vectorization, so a
9461 // non-null EVL also implies a preference for predicated
9462 // vectorization.
9463 // FIXME: Support reverse store after vp_reverse is added.
9464 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9466 Builder, State.get(getAddr(), Part, !CreateGatherScatter),
9467 StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment);
9468 } else if (CreateGatherScatter) {
9469 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9470 Value *VectorGep = State.get(getAddr(), Part);
9471 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9472 MaskPart);
9473 } else {
9474 if (isReverse()) {
9475 // If we store to reverse consecutive memory locations, then we need
9476 // to reverse the order of elements in the stored value.
9477 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9478 // We don't want to update the value in the map as it might be used in
9479 // another expression. So don't call resetVectorValue(StoredVal).
9480 }
9481 auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
9482 if (isMaskRequired)
9483 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9484 BlockInMaskParts[Part]);
9485 else
9486 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9487 }
9488 State.addMetadata(NewSI, SI);
9489 }
9490 return;
9491 }
9492
9493 // Handle loads.
9494 assert(LI && "Must have a load instruction");
9496 for (unsigned Part = 0; Part < State.UF; ++Part) {
9497 Value *NewLI;
9498 // TODO: split this into several classes for better design.
9499 if (State.EVL) {
9500 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9501 "explicit vector length.");
9502 assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
9503 VPInstruction::ExplicitVectorLength &&
9504 "EVL must be VPInstruction::ExplicitVectorLength.");
9505 Value *EVL = State.get(State.EVL, VPIteration(0, 0));
9506 // If EVL is not nullptr, then it must be a valid value set during plan
9507 // creation, possibly defaulting to the whole vector register length. EVL
9508 // is created only if TTI prefers predicated vectorization, so a
9509 // non-null EVL also implies a preference for predicated
9510 // vectorization.
9511 // FIXME: Support reverse loading after vp_reverse is added.
9512 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9514 Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter),
9515 CreateGatherScatter, MaskPart, EVL, Alignment);
9516 } else if (CreateGatherScatter) {
9517 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9518 Value *VectorGep = State.get(getAddr(), Part);
9519 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9520 nullptr, "wide.masked.gather");
9521 State.addMetadata(NewLI, LI);
9522 } else {
9523 auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
9524 if (isMaskRequired)
9525 NewLI = Builder.CreateMaskedLoad(
9526 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9527 PoisonValue::get(DataTy), "wide.masked.load");
9528 else
9529 NewLI =
9530 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9531
9532 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9533 State.addMetadata(NewLI, LI);
9534 if (Reverse)
9535 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9536 }
9537
9538 State.set(getVPSingleValue(), NewLI, Part);
9539 }
9540}
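// A minimal scalar model of the masked wide load emitted in the non-gather,
// non-EVL path above: masked-off lanes take the passthrough value (poison in
// the real code) instead of touching memory, and for reverse accesses the
// loaded lanes are reversed afterwards. Illustrative only:
static void maskedWideLoadSketch(const int *Addr, const bool *Mask, unsigned VF,
                                 int Passthru, bool ReverseAccess, int *Dst) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Dst[Lane] = Mask[Lane] ? Addr[Lane] : Passthru;
  if (ReverseAccess)
    std::reverse(Dst, Dst + VF); // counterpart of CreateVectorReverse
}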
9541
9542// Determine how to lower the scalar epilogue, which depends on 1) optimising
9543// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9544// predication, and 4) a TTI hook that analyses whether the loop is suitable
9545// for predication.
9550 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9551 // don't look at hints or options, and don't request a scalar epilogue.
9552 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9553 // LoopAccessInfo (due to code dependency and not being able to reliably get
9554 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9555 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9556 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9557 // back to the old way and vectorize with versioning when forced. See D81345.)
9558 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9562
9563 // 2) If set, obey the directives
9564 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9572 };
9573 }
9574
9575 // 3) If set, obey the hints
9576 switch (Hints.getPredicate()) {
9581 };
9582
9583 // 4) if the TTI hook indicates this is profitable, request predication.
9584 TailFoldingInfo TFI(TLI, &LVL, IAI);
9587
9589}
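// A condensed sketch of the precedence implemented above, using a hypothetical
// result enum in place of ScalarEpilogueLowering (illustrative only): optsize
// wins outright, then the explicit option, then loop hints, then the TTI hook.
enum class EpilogueChoiceSketch { Allowed, NotAllowedOptSize, UsePredicate };
static EpilogueChoiceSketch chooseEpilogueSketch(bool OptForSize, bool OptionSet,
                                                 bool OptionWantsPredication,
                                                 bool HintWantsPredication,
                                                 bool TTIWantsPredication) {
  if (OptForSize)
    return EpilogueChoiceSketch::NotAllowedOptSize; // 1) code size
  if (OptionSet)                                    // 2) command-line option
    return OptionWantsPredication ? EpilogueChoiceSketch::UsePredicate
                                  : EpilogueChoiceSketch::Allowed;
  if (HintWantsPredication)                         // 3) loop hints
    return EpilogueChoiceSketch::UsePredicate;
  if (TTIWantsPredication)                          // 4) TTI hook
    return EpilogueChoiceSketch::UsePredicate;
  return EpilogueChoiceSketch::Allowed;
}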
9590
9591// Process the loop in the VPlan-native vectorization path. This path builds
9592 // VPlan upfront in the vectorization pipeline, which allows applying
9593// VPlan-to-VPlan transformations from the very beginning without modifying the
9594// input LLVM IR.
9601 LoopVectorizationRequirements &Requirements) {
9602
9603 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9604 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9605 return false;
9606 }
9607 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9608 Function *F = L->getHeader()->getParent();
9609 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9610
9612 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9613
9614 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9615 &Hints, IAI);
9616 // Use the planner for outer loop vectorization.
9617 // TODO: CM is not used at this point inside the planner. Turn CM into an
9618 // optional argument if we don't need it in the future.
9619 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9620 ORE);
9621
9622 // Get user vectorization factor.
9623 ElementCount UserVF = Hints.getWidth();
9624
9626
9627 // Plan how to best vectorize, return the best VF and its cost.
9628 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9629
9630 // If we are stress testing VPlan builds, do not attempt to generate vector
9631 // code. Masked vector code generation support will follow soon.
9632 // Also, do not attempt to vectorize if no vector code will be produced.
9634 return false;
9635
9636 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9637
9638 {
9639 bool AddBranchWeights =
9640 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9641 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9642 F->getParent()->getDataLayout(), AddBranchWeights);
9643 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9644 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9645 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9646 << L->getHeader()->getParent()->getName() << "\"\n");
9647 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9648 }
9649
9650 reportVectorization(ORE, L, VF, 1);
9651
9652 // Mark the loop as already vectorized to avoid vectorizing again.
9653 Hints.setAlreadyVectorized();
9654 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9655 return true;
9656}
9657
9658 // Emit a remark if there are stores to floats that required a floating point
9659 // extension. If the vectorized loop was generated with mixed floating point
9660 // precision, there will be a performance penalty from the conversion overhead
9661 // and the change in the vector width.
9664 for (BasicBlock *BB : L->getBlocks()) {
9665 for (Instruction &Inst : *BB) {
9666 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9667 if (S->getValueOperand()->getType()->isFloatTy())
9668 Worklist.push_back(S);
9669 }
9670 }
9671 }
9672
9673 // Traverse the floating point stores upwards, searching for floating point
9674 // conversions.
9677 while (!Worklist.empty()) {
9678 auto *I = Worklist.pop_back_val();
9679 if (!L->contains(I))
9680 continue;
9681 if (!Visited.insert(I).second)
9682 continue;
9683
9684 // Emit a remark if the floating point store required a floating
9685 // point conversion.
9686 // TODO: More work could be done to identify the root cause such as a
9687 // constant or a function return type and point the user to it.
9688 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9689 ORE->emit([&]() {
9690 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9691 I->getDebugLoc(), L->getHeader())
9692 << "floating point conversion changes vector width. "
9693 << "Mixed floating point precision requires an up/down "
9694 << "cast that will negatively impact performance.";
9695 });
9696
9697 for (Use &Op : I->operands())
9698 if (auto *OpI = dyn_cast<Instruction>(Op))
9699 Worklist.push_back(OpI);
9700 }
9701}
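// A minimal source-level example of the pattern flagged above; the function
// below is illustrative only. Storing to float after arithmetic in double
// forces an fpext inside the loop, so the vectorized loop has to mix f32 and
// f64 vector widths:
static void mixedPrecisionScaleSketch(float *A, double S, int N) {
  for (int I = 0; I < N; ++I)
    A[I] = static_cast<float>(A[I] * S); // A[I] is extended to double first
}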
9702
9703static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9705 std::optional<unsigned> VScale, Loop *L,
9706 ScalarEvolution &SE,
9708 InstructionCost CheckCost = Checks.getCost();
9709 if (!CheckCost.isValid())
9710 return false;
9711
9712 // When only interleaving, the scalar and vector costs will be equal, which
9713 // in turn would lead to a divide by 0. Fall back to the hard threshold.
9714 if (VF.Width.isScalar()) {
9715 if (CheckCost > VectorizeMemoryCheckThreshold) {
9716 LLVM_DEBUG(
9717 dbgs()
9718 << "LV: Interleaving only is not profitable due to runtime checks\n");
9719 return false;
9720 }
9721 return true;
9722 }
9723
9724 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9725 double ScalarC = *VF.ScalarCost.getValue();
9726 if (ScalarC == 0)
9727 return true;
9728
9729 // First, compute the minimum iteration count required so that the vector
9730 // loop outperforms the scalar loop.
9731 // The total cost of the scalar loop is
9732 // ScalarC * TC
9733 // where
9734 // * TC is the actual trip count of the loop.
9735 // * ScalarC is the cost of a single scalar iteration.
9736 //
9737 // The total cost of the vector loop is
9738 // RtC + VecC * (TC / VF) + EpiC
9739 // where
9740 // * RtC is the cost of the generated runtime checks
9741 // * VecC is the cost of a single vector iteration.
9742 // * TC is the actual trip count of the loop
9743 // * VF is the vectorization factor
9744 // * EpiC is the cost of the generated epilogue, including the cost
9745 // of the remaining scalar operations.
9746 //
9747 // Vectorization is profitable once the total vector cost is less than the
9748 // total scalar cost:
9749 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9750 //
9751 // Now we can compute the minimum required trip count TC as
9752 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9753 //
9754 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9755 // the computations are performed on doubles, not integers, and the result
9756 // is rounded up; hence we get an upper estimate of the TC.
9757 unsigned IntVF = VF.Width.getKnownMinValue();
9758 if (VF.Width.isScalable()) {
9759 unsigned AssumedMinimumVscale = 1;
9760 if (VScale)
9761 AssumedMinimumVscale = *VScale;
9762 IntVF *= AssumedMinimumVscale;
9763 }
9764 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9765 double RtC = *CheckCost.getValue();
9766 double MinTC1 = RtC / (ScalarC - VecCOverVF);
9767
9768 // Second, compute a minimum iteration count so that the cost of the
9769 // runtime checks is only a fraction of the total scalar loop cost. This
9770 // adds a loop-dependent bound on the overhead incurred if the runtime
9771 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9772 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9773 // cost, compute
9774 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9775 double MinTC2 = RtC * 10 / ScalarC;
9776
9777 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9778 // epilogue is allowed, choose the next closest multiple of VF. This should
9779 // partly compensate for ignoring the epilogue cost.
9780 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9781 if (SEL == CM_ScalarEpilogueAllowed)
9782 MinTC = alignTo(MinTC, IntVF);
9784
9785 LLVM_DEBUG(
9786 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9787 << VF.MinProfitableTripCount << "\n");
9788
9789 // Skip vectorization if the expected trip count is less than the minimum
9790 // required trip count.
9791 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9794 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9795 "trip count < minimum profitable VF ("
9796 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9797 << ")\n");
9798
9799 return false;
9800 }
9801 }
9802 return true;
9803}
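// A minimal numeric sketch of the two bounds computed above, using made-up
// costs: with ScalarC = 4, VecC = 10, VF = 8 and runtime-check cost RtC = 24,
//   MinTC1 = RtC / (ScalarC - VecC / VF) = 24 / (4 - 1.25) ~= 8.73
//   MinTC2 = RtC * 10 / ScalarC         = 60
// so the minimum profitable trip count is ceil(max(8.73, 60)) = 60, rounded up
// to a multiple of VF (64) when a scalar epilogue is allowed. The helper below
// is illustrative only and is not used by the pass:
static unsigned long long
minProfitableTripCountSketch(double ScalarC, double VecC, unsigned VF,
                             double RtC, bool AlignToVF) {
  double MinTC1 = RtC / (ScalarC - VecC / VF);
  double MinTC2 = RtC * 10 / ScalarC;
  auto MinTC =
      static_cast<unsigned long long>(std::ceil(std::max(MinTC1, MinTC2)));
  if (AlignToVF)
    MinTC = (MinTC + VF - 1) / VF * VF; // round up to a multiple of VF
  return MinTC;
}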
9804
9806 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9808 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9810
9812 assert((EnableVPlanNativePath || L->isInnermost()) &&
9813 "VPlan-native path is not enabled. Only process inner loops.");
9814
9815#ifndef NDEBUG
9816 const std::string DebugLocStr = getDebugLocString(L);
9817#endif /* NDEBUG */
9818
9819 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9820 << L->getHeader()->getParent()->getName() << "' from "
9821 << DebugLocStr << "\n");
9822
9823 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9824
9825 LLVM_DEBUG(
9826 dbgs() << "LV: Loop hints:"
9827 << " force="
9829 ? "disabled"
9831 ? "enabled"
9832 : "?"))
9833 << " width=" << Hints.getWidth()
9834 << " interleave=" << Hints.getInterleave() << "\n");
9835
9836 // Function containing loop
9837 Function *F = L->getHeader()->getParent();
9838
9839 // Looking at the diagnostic output is the only way to determine if a loop
9840 // was vectorized (other than looking at the IR or machine code), so it
9841 // is important to generate an optimization remark for each loop. Most of
9842 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9843 // generated as OptimizationRemark and OptimizationRemarkMissed are
9844 // less verbose, reporting vectorized loops and unvectorized loops that may
9845 // benefit from vectorization, respectively.
9846
9847 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9848 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9849 return false;
9850 }
9851
9852 PredicatedScalarEvolution PSE(*SE, *L);
9853
9854 // Check if it is legal to vectorize the loop.
9855 LoopVectorizationRequirements Requirements;
9856 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9857 &Requirements, &Hints, DB, AC, BFI, PSI);
9859 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9860 Hints.emitRemarkWithHints();
9861 return false;
9862 }
9863
9864 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9865 // here. They may require CFG and instruction level transformations before
9866 // even evaluating whether vectorization is profitable. Since we cannot modify
9867 // the incoming IR, we need to build VPlan upfront in the vectorization
9868 // pipeline.
9869 if (!L->isInnermost())
9870 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9871 ORE, BFI, PSI, Hints, Requirements);
9872
9873 assert(L->isInnermost() && "Inner loop expected.");
9874
9875 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9876 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9877
9878 // If an override option has been passed in for interleaved accesses, use it.
9879 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9880 UseInterleaved = EnableInterleavedMemAccesses;
9881
9882 // Analyze interleaved memory accesses.
9883 if (UseInterleaved)
9885
9886 // Check the function attributes and profiles to find out if this function
9887 // should be optimized for size.
9889 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9890
9891 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9892 // count by optimizing for size, to minimize overheads.
9893 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9894 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9895 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9896 << "This loop is worth vectorizing only if no scalar "
9897 << "iteration overheads are incurred.");
9899 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9900 else {
9901 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9902 LLVM_DEBUG(dbgs() << "\n");
9903 // Predicate tail-folded loops are efficient even when the loop
9904 // iteration count is low. However, setting the epilogue policy to
9905 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9906 // with runtime checks. It's more effective to let
9907 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9908 // for the loop.
9911 } else {
9912 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9913 "small to consider vectorizing.\n");
9915 "The trip count is below the minimal threshold value.",
9916 "loop trip count is too low, avoiding vectorization",
9917 "LowTripCount", ORE, L);
9918 Hints.emitRemarkWithHints();
9919 return false;
9920 }
9921 }
9922 }
9923
9924 // Check the function attributes to see if implicit floats or vectors are
9925 // allowed.
9926 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9928 "Can't vectorize when the NoImplicitFloat attribute is used",
9929 "loop not vectorized due to NoImplicitFloat attribute",
9930 "NoImplicitFloat", ORE, L);
9931 Hints.emitRemarkWithHints();
9932 return false;
9933 }
9934
9935 // Check if the target supports potentially unsafe FP vectorization.
9936 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9937 // for the target we're vectorizing for, to make sure none of the
9938 // additional fp-math flags can help.
9939 if (Hints.isPotentiallyUnsafe() &&
9942 "Potentially unsafe FP op prevents vectorization",
9943 "loop not vectorized due to unsafe FP support.",
9944 "UnsafeFP", ORE, L);
9945 Hints.emitRemarkWithHints();
9946 return false;
9947 }
9948
9949 bool AllowOrderedReductions;
9950 // If the flag is set, use that instead and override the TTI behaviour.
9951 if (ForceOrderedReductions.getNumOccurrences() > 0)
9952 AllowOrderedReductions = ForceOrderedReductions;
9953 else
9954 AllowOrderedReductions = TTI->enableOrderedReductions();
9955 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9956 ORE->emit([&]() {
9957 auto *ExactFPMathInst = Requirements.getExactFPInst();
9958 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9959 ExactFPMathInst->getDebugLoc(),
9960 ExactFPMathInst->getParent())
9961 << "loop not vectorized: cannot prove it is safe to reorder "
9962 "floating-point operations";
9963 });
9964 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9965 "reorder floating-point operations\n");
9966 Hints.emitRemarkWithHints();
9967 return false;
9968 }
9969
9970 // Use the cost model.
9971 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9972 F, &Hints, IAI);
9973 // Use the planner for vectorization.
9974 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9975 ORE);
9976
9977 // Get user vectorization factor and interleave count.
9978 ElementCount UserVF = Hints.getWidth();
9979 unsigned UserIC = Hints.getInterleave();
9980
9981 // Plan how to best vectorize, return the best VF and its cost.
9982 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9983
9985 unsigned IC = 1;
9986
9987 bool AddBranchWeights =
9988 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9989 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9990 F->getParent()->getDataLayout(), AddBranchWeights);
9991 if (MaybeVF) {
9992 VF = *MaybeVF;
9993 // Select the interleave count.
9994 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9995
9996 unsigned SelectedIC = std::max(IC, UserIC);
9997 // Optimistically generate runtime checks if they are needed. Drop them if
9998 // they turn out to not be profitable.
9999 if (VF.Width.isVector() || SelectedIC > 1)
10000 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10001
10002 // Check if it is profitable to vectorize with runtime checks.
10003 bool ForceVectorization =
10005 if (!ForceVectorization &&
10007 *PSE.getSE(), SEL)) {
10008 ORE->emit([&]() {
10010 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10011 L->getHeader())
10012 << "loop not vectorized: cannot prove it is safe to reorder "
10013 "memory operations";
10014 });
10015 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10016 Hints.emitRemarkWithHints();
10017 return false;
10018 }
10019 }
10020
10021 // Identify the diagnostic messages that should be produced.
10022 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10023 bool VectorizeLoop = true, InterleaveLoop = true;
10024 if (VF.Width.isScalar()) {
10025 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10026 VecDiagMsg = std::make_pair(
10027 "VectorizationNotBeneficial",
10028 "the cost-model indicates that vectorization is not beneficial");
10029 VectorizeLoop = false;
10030 }
10031
10032 if (!MaybeVF && UserIC > 1) {
10033 // Tell the user interleaving was avoided up-front, despite being explicitly
10034 // requested.
10035 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10036 "interleaving should be avoided up front\n");
10037 IntDiagMsg = std::make_pair(
10038 "InterleavingAvoided",
10039 "Ignoring UserIC, because interleaving was avoided up front");
10040 InterleaveLoop = false;
10041 } else if (IC == 1 && UserIC <= 1) {
10042 // Tell the user interleaving is not beneficial.
10043 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10044 IntDiagMsg = std::make_pair(
10045 "InterleavingNotBeneficial",
10046 "the cost-model indicates that interleaving is not beneficial");
10047 InterleaveLoop = false;
10048 if (UserIC == 1) {
10049 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10050 IntDiagMsg.second +=
10051 " and is explicitly disabled or interleave count is set to 1";
10052 }
10053 } else if (IC > 1 && UserIC == 1) {
10054 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10055 LLVM_DEBUG(
10056 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10057 IntDiagMsg = std::make_pair(
10058 "InterleavingBeneficialButDisabled",
10059 "the cost-model indicates that interleaving is beneficial "
10060 "but is explicitly disabled or interleave count is set to 1");
10061 InterleaveLoop = false;
10062 }
10063
10064 // Override IC if user provided an interleave count.
10065 IC = UserIC > 0 ? UserIC : IC;
10066
10067 // Emit diagnostic messages, if any.
10068 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10069 if (!VectorizeLoop && !InterleaveLoop) {
10070 // Do not vectorize or interleave the loop.
10071 ORE->emit([&]() {
10072 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10073 L->getStartLoc(), L->getHeader())
10074 << VecDiagMsg.second;
10075 });
10076 ORE->emit([&]() {
10077 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10078 L->getStartLoc(), L->getHeader())
10079 << IntDiagMsg.second;
10080 });
10081 return false;
10082 } else if (!VectorizeLoop && InterleaveLoop) {
10083 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10084 ORE->emit([&]() {
10085 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10086 L->getStartLoc(), L->getHeader())
10087 << VecDiagMsg.second;
10088 });
10089 } else if (VectorizeLoop && !InterleaveLoop) {
10090 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10091 << ") in " << DebugLocStr << '\n');
10092 ORE->emit([&]() {
10093 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10094 L->getStartLoc(), L->getHeader())
10095 << IntDiagMsg.second;
10096 });
10097 } else if (VectorizeLoop && InterleaveLoop) {
10098 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10099 << ") in " << DebugLocStr << '\n');
10100 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10101 }
10102
10103 bool DisableRuntimeUnroll = false;
10104 MDNode *OrigLoopID = L->getLoopID();
10105 {
10106 using namespace ore;
10107 if (!VectorizeLoop) {
10108 assert(IC > 1 && "interleave count should not be 1 or 0");
10109 // If we decided that it is not profitable to vectorize the loop, then
10110 // interleave it instead.
10111 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10112 &CM, BFI, PSI, Checks);
10113
10114 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10115 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10116
10117 ORE->emit([&]() {
10118 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10119 L->getHeader())
10120 << "interleaved loop (interleaved count: "
10121 << NV("InterleaveCount", IC) << ")";
10122 });
10123 } else {
10124 // If we decided that it is legal and profitable to vectorize the loop, do it.
10125
10126 // Consider vectorizing the epilogue too if it's profitable.
10127 VectorizationFactor EpilogueVF =
10129 if (EpilogueVF.Width.isVector()) {
10130
10131 // The first pass vectorizes the main loop and creates a scalar epilogue
10132 // to be vectorized by executing the plan (potentially with a different
10133 // factor) again shortly afterwards.
10134 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10135 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10136 EPI, &LVL, &CM, BFI, PSI, Checks);
10137
10138 std::unique_ptr<VPlan> BestMainPlan(
10140 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10141 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10142 ++LoopsVectorized;
10143
10144 // Second pass vectorizes the epilogue and adjusts the control flow
10145 // edges from the first pass.
10146 EPI.MainLoopVF = EPI.EpilogueVF;
10147 EPI.MainLoopUF = EPI.EpilogueUF;
10148 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10149 ORE, EPI, &LVL, &CM, BFI, PSI,
10150 Checks);
10151
10152 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10153 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10154 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10155 Header->setName("vec.epilog.vector.body");
10156
10157 // Re-use the trip count and steps expanded for the main loop, as
10158 // skeleton creation needs them as values that dominate both the scalar
10159 // and vector epilogue loops.
10160 // TODO: This is a workaround needed for epilogue vectorization and it
10161 // should be removed once induction resume value creation is done
10162 // directly in VPlan.
10163 EpilogILV.setTripCount(MainILV.getTripCount());
10164 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10165 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10166 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10167 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10168 ExpandR->replaceAllUsesWith(ExpandedVal);
10169 if (BestEpiPlan.getTripCount() == ExpandR)
10170 BestEpiPlan.resetTripCount(ExpandedVal);
10171 ExpandR->eraseFromParent();
10172 }
10173
10174 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10175 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10176 // before vectorizing the epilogue loop.
10177 for (VPRecipeBase &R : Header->phis()) {
10178 if (isa<VPCanonicalIVPHIRecipe>(&R))
10179 continue;
10180
10181 Value *ResumeV = nullptr;
10182 // TODO: Move setting of resume values to prepareToExecute.
10183 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10184 const RecurrenceDescriptor &RdxDesc =
10185 ReductionPhi->getRecurrenceDescriptor();
10186 RecurKind RK = RdxDesc.getRecurrenceKind();
10187 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10189 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10190 // start value; compare the final value from the main vector loop
10191 // to the start value.
10192 IRBuilder<> Builder(
10193 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10194 ResumeV = Builder.CreateICmpNE(ResumeV,
10195 RdxDesc.getRecurrenceStartValue());
10196 }
10197 } else {
10198 // Create induction resume values for both widened pointer and
10199 // integer/fp inductions and update the start value of the induction
10200 // recipes to use the resume value.
10201 PHINode *IndPhi = nullptr;
10202 const InductionDescriptor *ID;
10203 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10204 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10205 ID = &Ind->getInductionDescriptor();
10206 } else {
10207 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10208 IndPhi = WidenInd->getPHINode();
10209 ID = &WidenInd->getInductionDescriptor();
10210 }
10211
10212 ResumeV = MainILV.createInductionResumeValue(
10213 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10215 }
10216 assert(ResumeV && "Must have a resume value");
10217 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10218 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10219 }
10220
10221 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10222 DT, true, &ExpandedSCEVs);
10223 ++LoopsEpilogueVectorized;
10224
10225 if (!MainILV.areSafetyChecksAdded())
10226 DisableRuntimeUnroll = true;
10227 } else {
10228 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10229 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10230 PSI, Checks);
10231
10232 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10233 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10234 ++LoopsVectorized;
10235
10236 // Add metadata to disable runtime unrolling of the scalar loop when there
10237 // are no runtime checks about strides and memory. A scalar loop that is
10238 // rarely used is not worth unrolling.
10239 if (!LB.areSafetyChecksAdded())
10240 DisableRuntimeUnroll = true;
10241 }
10242 // Report the vectorization decision.
10243 reportVectorization(ORE, L, VF, IC);
10244 }
10245
10248 }
10249
10250 std::optional<MDNode *> RemainderLoopID =
10253 if (RemainderLoopID) {
10254 L->setLoopID(*RemainderLoopID);
10255 } else {
10256 if (DisableRuntimeUnroll)
10258
10259 // Mark the loop as already vectorized to avoid vectorizing again.
10260 Hints.setAlreadyVectorized();
10261 }
10262
10263 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10264 return true;
10265}
10266
10272 SE = &SE_;
10273 LI = &LI_;
10274 TTI = &TTI_;
10275 DT = &DT_;
10276 BFI = BFI_;
10277 TLI = TLI_;
10278 AC = &AC_;
10279 LAIs = &LAIs_;
10280 DB = &DB_;
10281 ORE = &ORE_;
10282 PSI = PSI_;
10283
10284 // Don't attempt if
10285 // 1. the target claims to have no vector registers, and
10286 // 2. interleaving won't help ILP.
10287 //
10288 // The second condition is necessary because, even if the target has no
10289 // vector registers, loop vectorization may still enable scalar
10290 // interleaving.
10293 return LoopVectorizeResult(false, false);
10294
10295 bool Changed = false, CFGChanged = false;
10296
10297 // The vectorizer requires loops to be in simplified form.
10298 // Since simplification may add new inner loops, it has to run before the
10299 // legality and profitability checks. This means running the loop vectorizer
10300 // will simplify all loops, regardless of whether anything end up being
10301 // will simplify all loops, regardless of whether anything ends up being
10302 for (const auto &L : *LI)
10303 Changed |= CFGChanged |=
10304 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10305
10306 // Build up a worklist of inner-loops to vectorize. This is necessary as
10307 // the act of vectorizing or partially unrolling a loop creates new loops
10308 // and can invalidate iterators across the loops.
10309 SmallVector<Loop *, 8> Worklist;
10310
10311 for (Loop *L : *LI)
10312 collectSupportedLoops(*L, LI, ORE, Worklist);
10313
10314 LoopsAnalyzed += Worklist.size();
10315
10316 // Now walk the identified inner loops.
10317 while (!Worklist.empty()) {
10318 Loop *L = Worklist.pop_back_val();
10319
10320 // For the inner loops we actually process, form LCSSA to simplify the
10321 // transform.
10322 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10323
10324 Changed |= CFGChanged |= processLoop(L);
10325
10326 if (Changed) {
10327 LAIs->clear();
10328
10329#ifndef NDEBUG
10330 if (VerifySCEV)
10331 SE->verify();
10332#endif
10333 }
10334 }
10335
10336 // Process each loop nest in the function.
10337 return LoopVectorizeResult(Changed, CFGChanged);
10338}
10339
10342 auto &LI = AM.getResult<LoopAnalysis>(F);
10343 // There are no loops in the function. Return before computing other expensive
10344 // analyses.
10345 if (LI.empty())
10346 return PreservedAnalyses::all();
10348 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10349 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10350 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10351 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10352 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10354
10356 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10358 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10359 BlockFrequencyInfo *BFI = nullptr;
10360 if (PSI && PSI->hasProfileSummary())
10362 LoopVectorizeResult Result =
10363 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10364 if (!Result.MadeAnyChange)
10365 return PreservedAnalyses::all();
10367
10368 if (isAssignmentTrackingEnabled(*F.getParent())) {
10369 for (auto &BB : F)
10371 }
10372
10373 // We currently do not preserve loopinfo/dominator analyses with outer loop
10374 // vectorization. Until this is addressed, mark these analyses as preserved
10375 // only for non-VPlan-native path.
10376 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10377 if (!EnableVPlanNativePath) {
10378 PA.preserve<LoopAnalysis>();
10381 }
10382
10383 if (Result.MadeCFGChange) {
10384 // Making CFG changes likely means a loop got vectorized. Indicate that
10385 // extra simplification passes should be run.
10386 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10387 // be run if runtime checks have been added.
10390 } else {
10392 }
10393 return PA;
10394}
10395
10397 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10398 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10399 OS, MapClassName2PassName);
10400
10401 OS << '<';
10402 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10403 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10404 OS << '>';
10405}
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock)
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
static void emitInvalidCostRemarks(SmallVector< InstructionVFPair > InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
static Instruction * lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, VectorType *DataTy, Value *Addr, bool IsGather, Value *Mask, Value *EVL, const Align &Alignment)
Creates either vp_load or vp_gather intrinsics calls to represent predicated load/gather.
static Instruction * lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, Value *StoredVal, bool IsScatter, Value *Mask, Value *EVL, const Align &Alignment)
Creates either vp_store or vp_scatter intrinsics calls to represent predicated store/scatter.
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I)
Create an analysis remark that explains why vectorization failed.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::string getDebugLocString(const Loop *L)
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static Type * largestIntegerVectorType(Type *T1, Type *T2)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction variable might overflow in the vectorized loop.
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication, or in order to mask away gaps.
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
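As a minimal illustrative sketch (the 50% block probability and the cost values are assumptions for illustration, not taken from this file), the reciprocal is used to discount the cost of code that only executes when its predicate is true:
  // If a predicated block is assumed to execute on half of the iterations,
  // its reciprocal block probability is 2 and its cost contribution is halved.
  unsigned ReciprocalPredBlockProb = 2; // assumed illustrative value
  unsigned BlockCost = 10;              // made-up per-iteration cost
  unsigned DiscountedCost = BlockCost / ReciprocalPredBlockProb; // == 5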
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV after verifying that the access pattern is loop invariant except for the induction variable dependence.
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
This file contains the declarations for profiling metadata utility functions.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
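A minimal sketch of the two APInt members listed above (hypothetical standalone snippet, not code from this file):
  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  void apintExample() {
    APInt AllOnes = APInt::getAllOnes(8);    // 8-bit value with every bit set (0xFF)
    int64_t Signed = AllOnes.getSExtValue(); // sign-extended: -1
    (void)Signed;
  }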
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
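A minimal sketch of passing contiguous storage through ArrayRef (the helper names are hypothetical):
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  static size_t numElements(ArrayRef<int> Vals) { return Vals.size(); }

  void arrayRefExample() {
    SmallVector<int, 4> Storage = {1, 2, 3};
    size_t N = numElements(Storage); // ArrayRef binds to the vector's storage; N == 3
    (void)N;
  }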
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:411
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:499
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:360
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:452
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition: BasicBlock.h:221
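A minimal sketch using a few of the BasicBlock members listed above (hypothetical helper, assuming a well-formed block):
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static unsigned countPhis(BasicBlock &BB) {
    unsigned NumPhis = 0;
    for (PHINode &Phi : BB.phis()) { // iterate only the leading PHI nodes
      (void)Phi;
      ++NumPhis;
    }
    // For a well-formed block this is the branch/return/etc. at the end.
    (void)BB.getTerminator();
    return NumPhis;
  }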
BinaryOps getOpcode() const
Definition: InstrTypes.h:486
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
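A small hypothetical helper showing the conditional-branch queries above:
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static Value *getConditionOrNull(BranchInst *BI) {
    // Unconditional branches have no condition operand.
    return (BI && BI->isConditional()) ? BI->getCondition() : nullptr;
  }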
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2195
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1715
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1660
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1651
unsigned arg_size() const
Definition: InstrTypes.h:1658
This class represents a function call, abstracting a target machine's calling convention.
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:966
@ FIRST_ICMP_PREDICATE
Definition: InstrTypes.h:997
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:989
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:991
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:992
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:856
This is an important base class in LLVM.
Definition: Constant.h:41
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
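A minimal DenseMap sketch exercising the members documented above (hypothetical standalone snippet):
  #include "llvm/ADT/DenseMap.h"
  using namespace llvm;

  void denseMapExample() {
    DenseMap<int, unsigned> Counts;
    Counts.insert({42, 1u});          // pair insert; no-op if the key already exists
    if (Counts.contains(42)) {
      unsigned C = Counts.lookup(42); // 1; lookup of a missing key returns 0
      (void)C;
    }
  }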
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:311
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:302
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
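A minimal sketch of fixed vs. scalable element counts (hypothetical snippet; the vector types in the comments are only examples):
  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  void elementCountExample() {
    ElementCount Fixed = ElementCount::getFixed(4);       // e.g. a <4 x i32> vector
    ElementCount Scalable = ElementCount::getScalable(4); // e.g. <vscale x 4 x i32>
    bool V = Fixed.isVector();    // true: more than one element
    bool S = Scalable.isScalar(); // false: the minimum is four elements
    (void)V; (void)S;
  }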
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any iterations left to execute by either the vector epilogue or the scalar epilogue.
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace information is requested.
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i.e. the second pass of vplan execution).
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the process of vectorizing loops and their epilogues.
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (i.e. the first pass of vplan execution).
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace information is requested.
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for the epilogue loop (when ForEpilogue is true).
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:318
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:681
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:201
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:709
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:678
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:683
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2450
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1214
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2506
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:526
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Definition: IRBuilder.cpp:1170
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2235
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1721
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2195
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2387
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2231
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:145
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2484
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1327
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2341
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1404
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Definition: IRBuilder.cpp:109
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2656
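A minimal IRBuilder sketch combining a few of the creation helpers listed above (hypothetical helper; A and B are assumed to be i32 values):
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  static Value *emitIsSumZero(IRBuilderBase &Builder, Value *A, Value *B) {
    Value *Sum = Builder.CreateAdd(A, B, "sum");
    // Compare the sum against a constant i32 0 and return the i1 result.
    return Builder.CreateICmpEQ(Sum, Builder.getInt32(0), "sum.is.zero");
  }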
A struct for saving information about induction variables.
BinaryOperator * getInductionBinOp() const
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its epilogue (residual) also vectorized.
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will be used as the scalar remainder.
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separate passes.
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization factor (VF).
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace information is requested.
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epilogue, from where the vectorized loop left off.
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created for it.
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitability analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State)
Create code for the loop exit value of the reduction.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the middle block and updating the dominator tree.
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue, from where the vectorized loop left off.
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void vectorizeInterleaveGroup(const InterleaveGroup< Instruction > *Group, ArrayRef< VPValue * > VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef< VPValue * > StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps)
Try to vectorize the interleaved access group Group with the base address given in Addr, optionally masking the vector operations if BlockInMask is non-null.
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State)
Create the exit value of first order recurrences in the middle block and update their users.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will be used as the scalar remainder.
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks if vectorization turns out unprofitable.
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
Definition: Instruction.cpp:80
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports these flags.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:451
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right before MovePos.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:444
uint32_t getFactor() const
Definition: VectorUtils.h:460
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:514
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
Definition: VectorUtils.h:521
bool isReverse() const
Definition: VectorUtils.h:459
InstTy * getInsertPos() const
Definition: VectorUtils.h:530
void addMetadata(InstTy *NewInst) const
Add metadata (e.g. alias info) from the instructions in this group to NewInst.
Align getAlign() const
Definition: VectorUtils.h:461
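A small hypothetical helper walking the members of an interleave group via the accessors above:
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  static void visitGroupMembers(const InterleaveGroup<Instruction> *Group) {
    for (uint32_t I = 0; I < Group->getFactor(); ++I)
      if (Instruction *Member = Group->getMember(I))
        (void)Member; // a gap in the group yields a null member
  }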
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:586
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:631
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue iteration for correctness.
Definition: VectorUtils.h:642
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:623
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in the loop will be predicated contrary to the original assumption.
Definition: VectorUtils.h:606
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:636
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic stride, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
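A minimal hypothetical helper using the loop-shape queries listed above:
  #include "llvm/Analysis/LoopInfo.h"
  using namespace llvm;

  static bool hasSimpleShape(const Loop *L) {
    // A dedicated preheader, a single latch and a single exiting block are
    // the usual structural preconditions checked before transforming a loop.
    return L->getLoopPreheader() && L->getLoopLatch() && L->getExitingBlock();
  }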
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1222
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vectorized with wide vector loads/stores and shuffles.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the original loop for all VFs in Range.
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
bool isAccessInterleaved(Instruction *Instr)
Check if Instr belongs to any interleaved access group.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::pair< InstructionCost, bool > VectorizationCostTy
The vectorization cost is a combination of the cost itself and a boolean indicating whether any of the contributing operations will actually operate on vector values after type legalization in the backend.
DemandedBits * DB
Demanded bits analysis.
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available and whether the target supports masking.
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the original loop.
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
VectorizationCostTy expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block require predication for any reason, e.g. because tail folding now requires a predicate or because the block in the original loop was predicated.
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires speculating at least one lane.
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr)
Get the interleaved access group that Instr belongs to.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invariant across all loop iterations.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool prepareToFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking, and mark all respective ...
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and interleaving should be avoided up front.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VFs between MinVF and MaxVF inclusive, according to the information gathered by Legal when it checked if it is legal to vectorize the loop.
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
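As a hypothetical sketch of the Predicate/Range pattern above (assuming this file's context, with CM a LoopVectorizationCostModel, I an Instruction and Range a VFRange provided by the caller):
  // Returns the answer for Range.Start and clamps Range so that the answer
  // is the same for every VF left in the range.
  static bool isUniformForRange(LoopVectorizationCostModel &CM, Instruction *I,
                                VFRange &Range) {
    return LoopVectorizationPlanner::getDecisionAndClampRange(
        [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
        Range);
  }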
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend to print the diagnostic.
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:66
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:501
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:191
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positives.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR unit.
Definition: PassManager.h:756
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will have (use 0 if you really have no idea).
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
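A hypothetical fragment creating a two-input phi with the members listed above (Builder, the incoming values IncA/IncB and the predecessor blocks PredA/PredB are assumed to be in scope):
  PHINode *Merge = Builder.CreatePHI(Builder.getInt32Ty(),
                                     /*NumReservedValues=*/2, "merge");
  Merge->addIncoming(IncA, PredA);
  Merge->addIncoming(IncB, PredB);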
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:129
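A minimal sketch of the usual pattern for building a pass's return value from these members (hypothetical helper; the flags are assumptions):
  #include "llvm/IR/Analysis.h"
  using namespace llvm;

  static PreservedAnalyses makePassResult(bool ChangedIR, bool ChangedCFG) {
    if (!ChangedIR)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;
    if (!ChangedCFG)
      PA.preserveSet<CFGAnalyses>(); // analyses that only rely on the CFG stay valid
    return PA;
  }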
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduction instructions for in-loop reductions.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,y) is loop invariant.
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
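A brief sketch of the trip-count queries listed above; SE and L are assumed inputs, and both queries return 0 when no small constant answer is available:
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

static bool hasSmallConstantTripCount(ScalarEvolution &SE, const Loop *L) {
  unsigned ExactTC = SE.getSmallConstantTripCount(L);  // 0 if unknown
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L); // 0 if unknown
  return ExactTC != 0 || MaxTC != 0;
}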
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
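As a usage note, SetVector combines set semantics with deterministic (insertion-order) iteration, which is why it is the usual worklist type; a minimal sketch:
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// insert() is a no-op for elements already present, so re-queueing is safe.
static void drainWorklist(SetVector<Instruction *> &Worklist) {
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    (void)I; // ...visit I, possibly Worklist.insert()-ing its operands...
  }
}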
This class provides computation of slot numbers for LLVM Assembly writing.
Definition: AsmWriter.cpp:693
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
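A short sketch of the visited-set idiom built on SmallPtrSet::insert, whose returned bool is true only on first insertion:
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static bool markVisited(SmallPtrSetImpl<const Value *> &Visited,
                        const Value *V) {
  return Visited.insert(V).second; // false if V was already in the set
}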
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If the target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if the target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
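A hedged sketch of the kind of legality-plus-cost query the TargetTransformInfo hooks above support; VecTy, Alignment and AddrSpace are assumed inputs and the helper itself is illustrative only:
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// An invalid cost signals that a masked load of VecTy cannot be widened.
static InstructionCost maskedLoadCost(const TargetTransformInfo &TTI,
                                      Type *VecTy, Align Alignment,
                                      unsigned AddrSpace) {
  if (!TTI.isLegalMaskedLoad(VecTy, Alignment))
    return InstructionCost::getInvalid();
  return TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment,
                                   AddrSpace,
                                   TargetTransformInfo::TCK_RecipThroughput);
}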
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:243
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
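A small sketch showing why getScalarType() is convenient: it strips a vector wrapper and is the identity on scalars, so one element-type check covers both cases (the predicate itself is illustrative):
#include "llvm/IR/Type.h"
using namespace llvm;

static bool hasVectorizableElementType(Type *Ty) {
  Type *ElemTy = Ty->getScalarType();
  return ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy() ||
         ElemTy->isPointerTy();
}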
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2693
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:2761
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:443
iterator end()
Definition: VPlan.h:2724
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:2722
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:2771
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:210
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:2752
bool empty() const
Definition: VPlan.h:2733
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:1934
VPRegionBlock * getParent()
Definition: VPlan.h:498
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:175
void setName(const Twine &newName)
Definition: VPlan.h:491
VPlan * getPlan()
Definition: VPlan.cpp:148
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:153
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:533
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3247
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
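A hedged sketch of VPBuilder use, assuming the in-tree headers that declare VPBuilder and the VPlan classes are on the include path and that VPBB, IV and BTC are existing parts of a plan:
#include "LoopVectorizationPlanner.h" // assumed location of VPBuilder
#include "VPlan.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Build "!(IV <= BTC)" as two VPInstructions appended to VPBB.
static VPValue *buildNegatedLatchCond(VPBuilder &Builder, VPBasicBlock *VPBB,
                                      VPValue *IV, VPValue *BTC) {
  Builder.setInsertPoint(VPBB);
  VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
  return Builder.createNot(Cmp);
}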
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2426
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:421
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:399
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:411
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:2631
VPValue * getStartValue() const
Definition: VPlan.h:2630
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1621
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1665
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1654
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1160
@ FirstOrderRecurrenceSplice
Definition: VPlan.h:1166
unsigned getOpcode() const
Definition: VPlan.h:1260
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:1991
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2032
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2038
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2045
unsigned getNumStoreOperands() const
Returns the number of stored operands of this interleave group.
Definition: VPlan.h:2065
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:169
static VPLane getFirstLane()
Definition: VPlan.h:167
A value that is used outside the VPlan.
Definition: VPlan.h:678
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:718
VPBasicBlock * getParent()
Definition: VPlan.h:743
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:809
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1083
A recipe for handling reduction phis.
Definition: VPlan.h:1875
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:1929
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:1921
A recipe to represent in-loop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2080
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:2826
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:2897
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2128
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:835
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:894
This class can be used to assign consecutive numbers to all VPValues in a VPlan and allows querying t...
Definition: VPlanValue.h:449
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:204
operand_range operands()
Definition: VPlanValue.h:279
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:259
unsigned getNumOperands() const
Definition: VPlanValue.h:253
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:254
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:248
Value * getUnderlyingValue()
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:78
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition: VPlan.cpp:1302
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1270
user_iterator user_begin()
Definition: VPlanValue.h:130
unsigned getNumUsers() const
Definition: VPlanValue.h:113
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:174
user_iterator user_end()
Definition: VPlanValue.h:132
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:169
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1274
user_range users()
Definition: VPlanValue.h:134
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1565
A recipe for widening Call instructions.
Definition: VPlan.h:1450
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2551
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1361
A recipe for handling GEP instructions.
Definition: VPlan.h:1523
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1678
A Recipe for widening load/store operations.
Definition: VPlan.h:2289
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2348
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2342
void execute(VPTransformState &State) override
Generate the wide load/store.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2357
bool isStore() const
Returns true if this recipe is a store.
Definition: VPlan.h:2354
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1803
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1842
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1839
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a vector-type copy of its ingredient.
Definition: VPlan.h:1329
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:2927
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:783
VPBasicBlock * getEntry()
Definition: VPlan.h:3020
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3045
void setName(const Twine &newName)
Definition: VPlan.h:3076
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3048
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3024
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3038
void removeLiveOut(PHINode *PN)
Definition: VPlan.h:3130
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:993
VPBasicBlock * getPreheader()
Definition: VPlan.h:3149
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3111
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE)
Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping original scalar pre-header) w...
Definition: VPlan.cpp:769
bool hasVF(ElementCount VF)
Definition: VPlan.h:3058
bool hasUF(unsigned UF) const
Definition: VPlan.h:3065
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3031
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3080
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:990
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:825
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3119
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3135
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3139
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1074
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:693
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:77
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:73
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
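A brief guarded-construction sketch for the VectorType factory above; the fixed lane count of 8 is arbitrary:
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static VectorType *getFixed8VectorTy(Type *ElemTy) {
  if (!VectorType::isValidElementType(ElemTy))
    return nullptr; // e.g. void or other non-vectorizable element types
  return VectorType::get(ElemTy, ElementCount::getFixed(8));
}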
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:217
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:203
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:243
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:210
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:224
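A short sketch of coefficient-wise ElementCount arithmetic, which preserves the scalable flag:
#include "llvm/Support/TypeSize.h"
using namespace llvm;

// halveVF(ElementCount::getFixed(8))    == ElementCount::getFixed(4)
// halveVF(ElementCount::getScalable(4)) == ElementCount::getScalable(2)
static ElementCount halveVF(ElementCount VF) {
  return VF.divideCoefficientBy(2);
}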
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:765
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
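A minimal PatternMatch sketch using the matchers listed above; the helper is hypothetical:
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Match a single-use integer multiply and capture its operands in A and B.
static bool matchSingleUseMul(Value *V, Value *&A, Value *&B) {
  return match(V, m_OneUse(m_Mul(m_Value(A), m_Value(B))));
}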
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1414
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3471
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:456
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1808
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
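As a quick worked example of divideCeil: divideCeil(10, 4) == 3, since 10/4 rounds up; the trivial wrapper below is for illustration only:
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

static uint64_t chunksNeeded(uint64_t NumElements, uint64_t ChunkSize) {
  return divideCeil(NumElements, ChunkSize);
}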
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7033
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:53
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:134
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:133
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2433
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
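The mask helpers above follow simple documented formulas; the concrete values in the comments below are worked examples, and the wrapper function is illustrative:
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

//   createStrideMask(0, 2, 4)  -> <0, 2, 4, 6>
//   createReplicatedMask(3, 2) -> <0, 0, 0, 1, 1, 1>
//   createInterleaveMask(4, 2) -> <0, 4, 1, 5, 2, 6, 3, 7>
static SmallVector<int, 16> strideMaskExample() {
  return createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
}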
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1616
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
@ Invalid
Denotes invalid value.
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1868
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
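A short sketch of the getLoadStore* helpers, which accept either a load or a store and so avoid duplicated isa<> dispatch; the assert documents the assumed precondition:
#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

static Type *accessedType(Instruction *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "expected a memory op");
  return getLoadStoreType(I);
}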
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
#define OP(n)
Definition: regex2.h:73
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
ElementCountComparator creates a total ordering for ElementCount for the purposes of using it in a se...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:74
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:85
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:87
ElementCount End
Definition: VPlan.h:92
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1848
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:219
bool isFirstIteration() const
Definition: VPlan.h:231
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:374
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:382
BasicBlock * ExitBB
The last IR BasicBlock in the output IR.
Definition: VPlan.h:378
BasicBlock * getPreheaderBBFor(VPRecipeBase *R)
Returns the BasicBlock* mapped to the pre-header of the loop region containing R.
Definition: VPlan.cpp:348
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:236
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:247
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:418
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:421
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:361
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:414
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:353
VPValue * EVL
If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid value set during plan transf...
Definition: VPlan.h:252
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:393
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:297
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:257
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:398
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlan.h:404
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:401
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:242
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:372
A recipe for widening select instructions.
Definition: VPlan.h:1489
static void addExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.