1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
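//
// For illustration only (this example is not part of the pass), a scalar loop
// such as:
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten with a vector width of 4 so that each 'wide'
// iteration processes lanes [i, i+3] and the induction variable advances by 4;
// any remaining n % 4 iterations run in a scalar epilogue loop.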
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is an ongoing development effort to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanTransforms.h"
63#include "VPlanVerifier.h"
64#include "llvm/ADT/APInt.h"
65#include "llvm/ADT/ArrayRef.h"
66#include "llvm/ADT/DenseMap.h"
68#include "llvm/ADT/Hashing.h"
69#include "llvm/ADT/MapVector.h"
70#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
81#include "llvm/Analysis/CFG.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfo.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/MDBuilder.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
122#include "llvm/IR/Type.h"
123#include "llvm/IR/Use.h"
124#include "llvm/IR/User.h"
125#include "llvm/IR/Value.h"
126#include "llvm/IR/ValueHandle.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
145#include <algorithm>
146#include <cassert>
147#include <cmath>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <map>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159
160#define LV_NAME "loop-vectorize"
161#define DEBUG_TYPE LV_NAME
162
163#ifndef NDEBUG
164const char VerboseDebug[] = DEBUG_TYPE "-verbose";
165#endif
166
167/// @{
168/// Metadata attribute names
169const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171 "llvm.loop.vectorize.followup_vectorized";
173 "llvm.loop.vectorize.followup_epilogue";
174/// @}
175
176STATISTIC(LoopsVectorized, "Number of loops vectorized");
177STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
178STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
179
181 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
182 cl::desc("Enable vectorization of epilogue loops."));
183
185 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
186 cl::desc("When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
188 "loops."));
189
191 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
192 cl::desc("Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
194
195/// Loops with a known constant trip count below this number are vectorized only
196/// if no scalar iteration overheads are incurred.
198 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
199 cl::desc("Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
201 "are incurred."));
202
204 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
205 cl::desc("The maximum allowed number of runtime memory checks"));
206
207// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
208// that predication is preferred, and this lists all options. I.e., the
209// vectorizer will try to fold the tail-loop (epilogue) into the vector body
210// and predicate the instructions accordingly. If tail-folding fails, there are
211// different fallback strategies depending on these values:
212namespace PreferPredicateTy {
213 enum Option {
214 ScalarEpilogue = 0,
215 PredicateElseScalarEpilogue,
216 PredicateOrDontVectorize
217 };
218} // namespace PreferPredicateTy
219
221 "prefer-predicate-over-epilogue",
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(TailFoldingStyle::None),
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));
258
260 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
261 cl::desc("Maximize bandwidth when selecting vectorization factor which "
262 "will be determined by the smallest type in loop."));
263
265 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
266 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
267
268/// An interleave-group may need masking if it resides in a block that needs
269/// predication, or in order to mask away gaps.
271 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
272 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
273
275 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
276 cl::desc("A flag that overrides the target's number of scalar registers."));
277
279 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
280 cl::desc("A flag that overrides the target's number of vector registers."));
281
283 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
284 cl::desc("A flag that overrides the target's max interleave factor for "
285 "scalar loops."));
286
288 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
289 cl::desc("A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
291
293 "force-target-instruction-cost", cl::init(0), cl::Hidden,
294 cl::desc("A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
297
299 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
300 cl::desc(
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
303
305 "small-loop-cost", cl::init(20), cl::Hidden,
306 cl::desc(
307 "The cost of a loop that is considered 'small' by the interleaver."));
308
310 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
311 cl::desc("Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
314
315// Runtime interleave loops for load/store throughput.
317 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
318 cl::desc(
319 "Enable runtime interleaving until load/store ports are saturated"));
320
321/// The number of stores in a loop that are allowed to need predication.
323 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
324 cl::desc("Max number of stores to be predicated behind an if."));
325
327 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
328 cl::desc("Count the induction variable only once when interleaving"));
329
331 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
332 cl::desc("Enable if predication of stores during vectorization."));
333
335 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
336 cl::desc("The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
338
339static cl::opt<bool>
340 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
342 cl::desc("Prefer in-loop vector reductions, "
343 "overriding the targets preference."));
344
346 "force-ordered-reductions", cl::init(false), cl::Hidden,
347 cl::desc("Enable the vectorisation of loops with in-order (strict) "
348 "FP reductions"));
349
351 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Prefer predicating a reduction operation over an after loop select."));
354
355namespace llvm {
357 "enable-vplan-native-path", cl::Hidden,
358 cl::desc("Enable VPlan-native vectorization path with "
359 "support for outer loop vectorization."));
360}
361
362// This flag enables the stress testing of the VPlan H-CFG construction in the
363// VPlan-native vectorization path. It must be used in conjunction with
364// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
365// verification of the H-CFGs built.
367 "vplan-build-stress-test", cl::init(false), cl::Hidden,
368 cl::desc(
369 "Build VPlan for every supported loop nest in the function and bail "
370 "out right after the build (stress test the VPlan H-CFG construction "
371 "in the VPlan-native vectorization path)."));
372
374 "interleave-loops", cl::init(true), cl::Hidden,
375 cl::desc("Enable loop interleaving in Loop vectorization passes"));
377 "vectorize-loops", cl::init(true), cl::Hidden,
378 cl::desc("Run the Loop vectorization passes"));
379
381 "vplan-print-in-dot-format", cl::Hidden,
382 cl::desc("Use dot format instead of plain text when dumping VPlans"));
383
385 "force-widen-divrem-via-safe-divisor", cl::Hidden,
386 cl::desc(
387 "Override cost based safe divisor widening for div/rem instructions"));
388
390 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
392 cl::desc("Try wider VFs if they enable the use of vector variants"));
393
394// Likelihood of bypassing the vectorized loop because assumptions about SCEV
395// variables not overflowing do not hold. See `emitSCEVChecks`.
396static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
397// Likelihood of bypassing the vectorized loop because pointers overlap. See
398// `emitMemRuntimeChecks`.
399static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
400// Likelihood of bypassing the vectorized loop because there are zero trips left
401// after prolog. See `emitIterationCountCheck`.
402static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
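// Note (editorial, based on how llvm.branch_weights metadata is interpreted):
// each {1, 127} pair marks the bypass edge as taken roughly once per 128
// executions, keeping the vectorized loop on the statically hot path.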
403
404/// A helper function that returns true if the given type is irregular. The
405/// type is irregular if its allocated size doesn't equal the store size of an
406/// element of the corresponding vector type.
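/// For example (illustrative, not exhaustive): x86_fp80 typically has a type
/// size of 80 bits but an alloc size of 96 or 128 bits, so an array of
/// x86_fp80 is not bitcast-compatible with <N x x86_fp80> and the type is
/// treated as irregular.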
407static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
408 // Determine if an array of N elements of type Ty is "bitcast compatible"
409 // with a <N x Ty> vector.
410 // This is only true if there is no padding between the array elements.
411 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
412}
413
414/// A helper function that returns the reciprocal of the block probability of
415/// predicated blocks. If we return X, we are assuming the predicated block
416/// will execute once for every X iterations of the loop header.
417///
418/// TODO: We should use actual block probability here, if available. Currently,
419/// we always assume predicated blocks have a 50% chance of executing.
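/// Illustrative use (an assumption about the callers, not quoted from them):
/// an instruction with scalarized cost C inside a predicated block is
/// accounted as roughly C / getReciprocalPredBlockProb(), i.e. C / 2 under
/// the 50% assumption.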
420static unsigned getReciprocalPredBlockProb() { return 2; }
421
422/// Returns "best known" trip count for the specified loop \p L as defined by
423/// the following procedure:
424/// 1) Returns exact trip count if it is known.
425/// 2) Returns expected trip count according to profile data if any.
426/// 3) Returns upper bound estimate if it is known.
427/// 4) Returns std::nullopt if all of the above failed.
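/// For example (illustrative): a loop whose latch branch carries
/// !{"branch_weights", i32 1, i32 99} profile metadata has no exact trip
/// count, but step 2 returns an estimated trip count of roughly 100.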
428static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
429 Loop *L) {
430 // Check if exact trip count is known.
431 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
432 return ExpectedTC;
433
434 // Check if there is an expected trip count available from profile data.
436 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
437 return *EstimatedTC;
438
439 // Check if upper bound estimate is known.
440 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
441 return ExpectedTC;
442
443 return std::nullopt;
444}
445
446/// Return a vector containing interleaved elements from multiple
447/// smaller input vectors.
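/// For example (fixed-width illustration): interleaving <A0, A1> and <B0, B1>
/// with Factor = 2 yields <A0, B0, A1, B1>; for scalable vectors the same
/// result is produced with the llvm.vector.interleave2 intrinsic.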
448static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
449 const Twine &Name) {
450 unsigned Factor = Vals.size();
451 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
452
453 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
454#ifndef NDEBUG
455 for (Value *Val : Vals)
456 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
457#endif
458
459 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
460 // must use intrinsics to interleave.
461 if (VecTy->isScalableTy()) {
462 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
463 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
464 Vals,
465 /*FMFSource=*/nullptr, Name);
466 }
467
468 // Fixed length. Start by concatenating all vectors into a wide vector.
469 Value *WideVec = concatenateVectors(Builder, Vals);
470
471 // Interleave the elements into the wide vector.
472 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
473 return Builder.CreateShuffleVector(
474 WideVec, createInterleaveMask(NumElts, Factor), Name);
475}
476
477namespace {
478// Forward declare GeneratedRTChecks.
479class GeneratedRTChecks;
480
481using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
482} // namespace
483
484namespace llvm {
485
487
488/// InnerLoopVectorizer vectorizes loops which contain only one basic
489/// block to a specified vectorization factor (VF).
490/// This class performs the widening of scalars into vectors, or multiple
491/// scalars. This class also implements the following features:
492/// * It inserts an epilogue loop for handling loops that don't have iteration
493/// counts that are known to be a multiple of the vectorization factor.
494/// * It handles the code generation for reduction variables.
495/// * Scalarization (implementation using scalars) of un-vectorizable
496/// instructions.
497/// InnerLoopVectorizer does not perform any vectorization-legality
498/// checks, and relies on the caller to check for the different legality
499/// aspects. The InnerLoopVectorizer relies on the
500/// LoopVectorizationLegality class to provide information about the induction
501/// and reduction variables that were found for a given vectorization factor.
502class InnerLoopVectorizer {
503public:
506 const TargetLibraryInfo *TLI,
510 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
512 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
513 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
514 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
515 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
517 // Query this against the original loop and save it here because the profile
518 // of the original loop header may change as the transformation happens.
521
523 this->MinProfitableTripCount = VecWidth;
524 else
525 this->MinProfitableTripCount = MinProfitableTripCount;
526 }
527
528 virtual ~InnerLoopVectorizer() = default;
529
530 /// Create a new empty loop that will contain vectorized instructions later
531 /// on, while the old loop will be used as the scalar remainder. Control flow
532 /// is generated around the vectorized (and scalar epilogue) loops consisting
533 /// of various checks and bypasses. Return the pre-header block of the new
534 /// loop and the start value for the canonical induction, if it is != 0. The
535 /// latter is the case when vectorizing the epilogue loop. In the case of
536/// epilogue vectorization, this function is overridden to handle the more
537 /// complex control flow around the loops. \p ExpandedSCEVs is used to
538 /// look up SCEV expansions for expressions needed during skeleton creation.
539 virtual std::pair<BasicBlock *, Value *>
540 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
541
542 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
543 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
544
545 // Return true if any runtime check is added.
547
548 /// A helper function to scalarize a single Instruction in the innermost loop.
549 /// Generates a sequence of scalar instances for each lane between \p MinLane
550 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
551 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
552 /// Instr's operands.
553 void scalarizeInstruction(const Instruction *Instr,
554 VPReplicateRecipe *RepRecipe,
555 const VPIteration &Instance,
556 VPTransformState &State);
557
558 /// Try to vectorize interleaved access group \p Group with the base address
559 /// given in \p Addr, optionally masking the vector operations if \p
560 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
561 /// values in the vectorized loop.
563 ArrayRef<VPValue *> VPDefs,
565 ArrayRef<VPValue *> StoredValues,
566 VPValue *BlockInMask, bool NeedsMaskForGaps);
567
568 /// Fix the non-induction PHIs in \p Plan.
569 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
570
571 /// Create a new phi node for the induction variable \p OrigPhi to resume
572 /// iteration count in the scalar epilogue, from where the vectorized loop
573 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
574 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
575 /// and the resume values can come from an additional bypass block, the \p
576 /// AdditionalBypass pair provides information about the bypass block and the
577 /// end value on the edge from bypass to this loop.
579 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
580 ArrayRef<BasicBlock *> BypassBlocks,
581 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
582
583 /// Returns the original loop trip count.
584 Value *getTripCount() const { return TripCount; }
585
586 /// Used to set the trip count after ILV's construction and after the
587 /// preheader block has been executed. Note that this always holds the trip
588 /// count of the original loop for both main loop and epilogue vectorization.
589 void setTripCount(Value *TC) { TripCount = TC; }
590
591protected:
593
594 /// A small list of PHINodes.
596
597 /// A type for scalarized values in the new loop. Each value from the
598 /// original loop, when scalarized, is represented by UF x VF scalar values
599 /// in the new unrolled loop, where UF is the unroll factor and VF is the
600 /// vectorization factor.
602
603 /// Set up the values of the IVs correctly when exiting the vector loop.
604 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
605 Value *VectorTripCount, Value *EndValue,
606 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
607 VPlan &Plan, VPTransformState &State);
608
609 /// Create the exit value of first order recurrences in the middle block and
610 /// update their users.
612 VPTransformState &State);
613
614 /// Iteratively sink the scalarized operands of a predicated instruction into
615 /// the block that was created for it.
616 void sinkScalarOperands(Instruction *PredInst);
617
618 /// Returns (and creates if needed) the trip count of the widened loop.
620
621 /// Returns a bitcasted value to the requested vector type.
622 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
624 const DataLayout &DL);
625
626 /// Emit a bypass check to see if the vector trip count is zero, including if
627 /// it overflows.
629
630 /// Emit a bypass check to see if all of the SCEV assumptions we've
631 /// had to make are correct. Returns the block containing the checks or
632 /// nullptr if no checks have been added.
634
635 /// Emit bypass checks to check any memory assumptions we may have made.
636 /// Returns the block containing the checks or nullptr if no checks have been
637 /// added.
639
640 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
641 /// vector loop preheader, middle block and scalar preheader.
643
644 /// Create new phi nodes for the induction variables to resume iteration count
645 /// in the scalar epilogue, from where the vectorized loop left off.
646 /// In cases where the loop skeleton is more complicated (e.g. epilogue
647 /// vectorization) and the resume values can come from an additional bypass
648 /// block, the \p AdditionalBypass pair provides information about the bypass
649 /// block and the end value on the edge from bypass to this loop.
651 const SCEV2ValueTy &ExpandedSCEVs,
652 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
653
654 /// Complete the loop skeleton by adding debug MDs, creating appropriate
655 /// conditional branches in the middle block, preparing the builder and
656 /// running the verifier. Return the preheader of the completed vector loop.
658
659 /// Allow subclasses to override and print debug traces before/after vplan
660 /// execution, when trace information is requested.
661 virtual void printDebugTracesAtStart() {}
662 virtual void printDebugTracesAtEnd() {}
663
664 /// The original loop.
666
667 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
668 /// dynamic knowledge to simplify SCEV expressions and converts them to a
669 /// more usable form.
671
672 /// Loop Info.
674
675 /// Dominator Tree.
677
678 /// Target Library Info.
680
681 /// Target Transform Info.
683
684 /// Assumption Cache.
686
687 /// Interface to emit optimization remarks.
689
690 /// The vectorization SIMD factor to use. Each vector will have this many
691 /// vector elements.
693
695
696 /// The vectorization unroll factor to use. Each scalar is vectorized to this
697 /// many different vector instructions.
698 unsigned UF;
699
700 /// The builder that we use
702
703 // --- Vectorization state ---
704
705 /// The vector-loop preheader.
707
708 /// The scalar-loop preheader.
710
711 /// Middle Block between the vector and the scalar.
713
714 /// The unique ExitBlock of the scalar loop if one exists. Note that
715 /// there can be multiple exiting edges reaching this block.
717
718 /// The scalar loop body.
720
721 /// A list of all bypass blocks. The first block is the entry of the loop.
723
724 /// Store instructions that were predicated.
726
727 /// Trip count of the original loop.
728 Value *TripCount = nullptr;
729
730 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
732
733 /// The legality analysis.
735
736 /// The profitability analysis.
738
739 // Record whether runtime checks are added.
740 bool AddedSafetyChecks = false;
741
742 // Holds the end values for each induction variable. We save the end values
743 // so we can later fix-up the external users of the induction variables.
745
746 /// BFI and PSI are used to check for profile guided size optimizations.
749
750 // Whether this loop should be optimized for size based on profile guided size
751 // optimizations.
753
754 /// Structure to hold information about generated runtime checks, responsible
755 /// for cleaning the checks, if vectorization turns out unprofitable.
756 GeneratedRTChecks &RTChecks;
757
758 // Holds the resume values for reductions in the loops, used to set the
759 // correct start value of reduction PHIs when vectorizing the epilogue.
762};
763
764class InnerLoopUnroller : public InnerLoopVectorizer {
765public:
768 const TargetLibraryInfo *TLI,
770 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
773 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
775 ElementCount::getFixed(1),
776 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
777 BFI, PSI, Check) {}
778};
779
780/// Encapsulate information regarding vectorization of a loop and its epilogue.
781/// This information is meant to be updated and used across two stages of
782/// epilogue vectorization.
785 unsigned MainLoopUF = 0;
787 unsigned EpilogueUF = 0;
792 Value *TripCount = nullptr;
794
796 ElementCount EVF, unsigned EUF)
797 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
798 assert(EUF == 1 &&
799 "A high UF for the epilogue loop is likely not beneficial.");
800 }
801};
802
803/// An extension of the inner loop vectorizer that creates a skeleton for a
804/// vectorized loop that has its epilogue (residual) also vectorized.
805/// The idea is to run the vplan on a given loop twice, firstly to set up the
806/// skeleton and vectorize the main loop, and secondly to complete the skeleton
807/// from the first step and vectorize the epilogue. This is achieved by
808/// deriving two concrete strategy classes from this base class and invoking
809/// them in succession from the loop vectorizer planner.
810class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
811public:
819 GeneratedRTChecks &Checks)
821 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
822 CM, BFI, PSI, Checks),
823 EPI(EPI) {}
824
825 // Override this function to handle the more complex control flow around the
826 // three loops.
827 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
828 const SCEV2ValueTy &ExpandedSCEVs) final {
829 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
830 }
831
832 /// The interface for creating a vectorized skeleton using one of two
833 /// different strategies, each corresponding to one execution of the vplan
834 /// as described above.
835 virtual std::pair<BasicBlock *, Value *>
836 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
837
838 /// Holds and updates state information required to vectorize the main loop
839 /// and its epilogue in two separate passes. This setup helps us avoid
840 /// regenerating and recomputing runtime safety checks. It also helps us to
841 /// shorten the iteration-count-check path length for the cases where the
842 /// iteration count of the loop is so small that the main vector loop is
843 /// completely skipped.
845};
846
847/// A specialized derived class of inner loop vectorizer that performs
848/// vectorization of *main* loops in the process of vectorizing loops and their
849/// epilogues.
850class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
851public:
859 GeneratedRTChecks &Check)
861 EPI, LVL, CM, BFI, PSI, Check) {}
862 /// Implements the interface for creating a vectorized skeleton using the
863 /// *main loop* strategy (ie the first pass of vplan execution).
864 std::pair<BasicBlock *, Value *>
865 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
866
867protected:
868 /// Emits an iteration count bypass check once for the main loop (when \p
869 /// ForEpilogue is false) and once for the epilogue loop (when \p
870 /// ForEpilogue is true).
871 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
872 void printDebugTracesAtStart() override;
873 void printDebugTracesAtEnd() override;
874};
875
876// A specialized derived class of inner loop vectorizer that performs
877// vectorization of *epilogue* loops in the process of vectorizing loops and
878// their epilogues.
879class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
880public:
888 GeneratedRTChecks &Checks)
890 EPI, LVL, CM, BFI, PSI, Checks) {
892 }
893 /// Implements the interface for creating a vectorized skeleton using the
894 /// *epilogue loop* strategy (ie the second pass of vplan execution).
895 std::pair<BasicBlock *, Value *>
896 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
897
898protected:
899 /// Emits an iteration count bypass check after the main vector loop has
900 /// finished to see if there are any iterations left to execute by either
901 /// the vector epilogue or the scalar epilogue.
903 BasicBlock *Bypass,
904 BasicBlock *Insert);
905 void printDebugTracesAtStart() override;
906 void printDebugTracesAtEnd() override;
907};
908} // end namespace llvm
909
910/// Look for a meaningful debug location on the instruction or its
911/// operands.
913 if (!I)
914 return DebugLoc();
915
917 if (I->getDebugLoc() != Empty)
918 return I->getDebugLoc();
919
920 for (Use &Op : I->operands()) {
921 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
922 if (OpInst->getDebugLoc() != Empty)
923 return OpInst->getDebugLoc();
924 }
925
926 return I->getDebugLoc();
927}
928
929/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
930/// is passed, the message relates to that particular instruction.
931#ifndef NDEBUG
932static void debugVectorizationMessage(const StringRef Prefix,
933 const StringRef DebugMsg,
934 Instruction *I) {
935 dbgs() << "LV: " << Prefix << DebugMsg;
936 if (I != nullptr)
937 dbgs() << " " << *I;
938 else
939 dbgs() << '.';
940 dbgs() << '\n';
941}
942#endif
943
944/// Create an analysis remark that explains why vectorization failed
945///
946/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
947/// RemarkName is the identifier for the remark. If \p I is passed it is an
948/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
949/// the location of the remark. \return the remark object that can be
950/// streamed to.
952 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
953 Value *CodeRegion = TheLoop->getHeader();
954 DebugLoc DL = TheLoop->getStartLoc();
955
956 if (I) {
957 CodeRegion = I->getParent();
958 // If there is no debug location attached to the instruction, fall back to
959 // using the loop's.
960 if (I->getDebugLoc())
961 DL = I->getDebugLoc();
962 }
963
964 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
965}
966
967namespace llvm {
968
969/// Return a value for Step multiplied by VF.
970Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
971 int64_t Step) {
972 assert(Ty->isIntegerTy() && "Expected an integer step");
973 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
974}
975
976/// Return the runtime value for VF.
977Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
978 return B.CreateElementCount(Ty, VF);
979}
980
982 Loop *OrigLoop) {
983 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
984 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
985
986 ScalarEvolution &SE = *PSE.getSE();
987 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
988}
989
991 const StringRef OREMsg, const StringRef ORETag,
992 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
993 Instruction *I) {
994 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
995 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
996 ORE->emit(
997 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
998 << "loop not vectorized: " << OREMsg);
999}
1000
1001void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1002 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1003 Instruction *I) {
1005 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1006 ORE->emit(
1007 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1008 << Msg);
1009}
1010
1011/// Report successful vectorization of the loop. In case an outer loop is
1012/// vectorized, prepend "outer" to the vectorization remark.
1014 VectorizationFactor VF, unsigned IC) {
1016 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1017 nullptr));
1018 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1019 ORE->emit([&]() {
1020 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1021 TheLoop->getHeader())
1022 << "vectorized " << LoopType << "loop (vectorization width: "
1023 << ore::NV("VectorizationFactor", VF.Width)
1024 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1025 });
1026}
1027
1028} // end namespace llvm
1029
1030#ifndef NDEBUG
1031/// \return string containing a file name and a line # for the given loop.
1032static std::string getDebugLocString(const Loop *L) {
1033 std::string Result;
1034 if (L) {
1035 raw_string_ostream OS(Result);
1036 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1037 LoopDbgLoc.print(OS);
1038 else
1039 // Just print the module name.
1040 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1041 OS.flush();
1042 }
1043 return Result;
1044}
1045#endif
1046
1047namespace llvm {
1048
1049// Loop vectorization cost-model hints how the scalar epilogue loop should be
1050// lowered.
1052
1053 // The default: allowing scalar epilogues.
1055
1056 // Vectorization with OptForSize: don't allow epilogues.
1058
1059 // A special case of vectorisation with OptForSize: loops with a very small
1060 // trip count are considered for vectorization under OptForSize, thereby
1061 // making sure the cost of their loop body is dominant, free of runtime
1062 // guards and scalar iteration overheads.
1064
1065 // Loop hint predicate indicating an epilogue is undesired.
1067
1068 // Directive indicating we must either tail fold or not vectorize
1071
1072using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1073
1074/// LoopVectorizationCostModel - estimates the expected speedups due to
1075/// vectorization.
1076/// In many cases vectorization is not profitable. This can happen because of
1077/// a number of reasons. In this class we mainly attempt to predict the
1078/// expected speedup/slowdowns due to the supported instruction set. We use the
1079/// TargetTransformInfo to query the different backends for the cost of
1080/// different operations.
1081class LoopVectorizationCostModel {
1082public:
1086 const TargetTransformInfo &TTI,
1092 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1093 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1094 Hints(Hints), InterleaveInfo(IAI) {}
1095
1096 /// \return An upper bound for the vectorization factors (both fixed and
1097 /// scalable). If the factors are 0, vectorization and interleaving should be
1098 /// avoided up front.
1099 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1100
1101 /// \return True if runtime checks are required for vectorization, and false
1102 /// otherwise.
1103 bool runtimeChecksRequired();
1104
1105 /// Setup cost-based decisions for user vectorization factor.
1106 /// \return true if the UserVF is a feasible VF to be chosen.
1110 return expectedCost(UserVF).first.isValid();
1111 }
1112
1113 /// \return The size (in bits) of the smallest and widest types in the code
1114 /// that needs to be vectorized. We ignore values that remain scalar such as
1115 /// 64 bit loop indices.
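/// For example (illustrative): a loop that loads i8 values and accumulates
/// them into an i32 sum would report (8, 32); its i64 induction variable is
/// ignored because it remains scalar.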
1116 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1117
1118 /// \return The desired interleave count.
1119 /// If interleave count has been specified by metadata it will be returned.
1120 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1121 /// are the selected vectorization factor and the cost of the selected VF.
1122 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1123
1124 /// Memory access instruction may be vectorized in more than one way.
1125 /// Form of instruction after vectorization depends on cost.
1126 /// This function takes cost-based decisions for Load/Store instructions
1127 /// and collects them in a map. This decisions map is used for building
1128 /// the lists of loop-uniform and loop-scalar instructions.
1129 /// The calculated cost is saved with widening decision in order to
1130 /// avoid redundant calculations.
1132
1133 /// A call may be vectorized in different ways depending on whether we have
1134 /// vectorized variants available and whether the target supports masking.
1135 /// This function analyzes all calls in the function at the supplied VF,
1136 /// makes a decision based on the costs of available options, and stores that
1137 /// decision in a map for use in planning and plan execution.
1139
1140 /// A struct that represents some properties of the register usage
1141 /// of a loop.
1143 /// Holds the number of loop invariant values that are used in the loop.
1144 /// The key is ClassID of target-provided register class.
1146 /// Holds the maximum number of concurrent live intervals in the loop.
1147 /// The key is ClassID of target-provided register class.
1149 };
1150
1151 /// \return Returns information about the register usages of the loop for the
1152 /// given vectorization factors.
1155
1156 /// Collect values we want to ignore in the cost model.
1157 void collectValuesToIgnore();
1158
1159 /// Collect all element types in the loop for which widening is needed.
1161
1162 /// Split reductions into those that happen in the loop, and those that happen
1163 /// outside. In loop reductions are collected into InLoopReductions.
1165
1166 /// Returns true if we should use strict in-order reductions for the given
1167 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1168 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1169 /// of FP operations.
1170 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1171 return !Hints->allowReordering() && RdxDesc.isOrdered();
1172 }
1173
1174 /// \returns The smallest bitwidth each instruction can be represented with.
1175 /// The vector equivalents of these instructions should be truncated to this
1176 /// type.
1178 return MinBWs;
1179 }
1180
1181 /// \returns True if it is more profitable to scalarize instruction \p I for
1182 /// vectorization factor \p VF.
1184 assert(VF.isVector() &&
1185 "Profitable to scalarize relevant only for VF > 1.");
1186 assert(
1187 TheLoop->isInnermost() &&
1188 "cost-model should not be used for outer loops (in VPlan-native path)");
1189
1190 auto Scalars = InstsToScalarize.find(VF);
1191 assert(Scalars != InstsToScalarize.end() &&
1192 "VF not yet analyzed for scalarization profitability");
1193 return Scalars->second.contains(I);
1194 }
1195
1196 /// Returns true if \p I is known to be uniform after vectorization.
1198 assert(
1199 TheLoop->isInnermost() &&
1200 "cost-model should not be used for outer loops (in VPlan-native path)");
1201 // Pseudo probe needs to be duplicated for each unrolled iteration and
1202 // vector lane so that profiled loop trip count can be accurately
1203 // accumulated instead of being under counted.
1204 if (isa<PseudoProbeInst>(I))
1205 return false;
1206
1207 if (VF.isScalar())
1208 return true;
1209
1210 auto UniformsPerVF = Uniforms.find(VF);
1211 assert(UniformsPerVF != Uniforms.end() &&
1212 "VF not yet analyzed for uniformity");
1213 return UniformsPerVF->second.count(I);
1214 }
1215
1216 /// Returns true if \p I is known to be scalar after vectorization.
1218 assert(
1219 TheLoop->isInnermost() &&
1220 "cost-model should not be used for outer loops (in VPlan-native path)");
1221 if (VF.isScalar())
1222 return true;
1223
1224 auto ScalarsPerVF = Scalars.find(VF);
1225 assert(ScalarsPerVF != Scalars.end() &&
1226 "Scalar values are not calculated for VF");
1227 return ScalarsPerVF->second.count(I);
1228 }
1229
1230 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1231 /// for vectorization factor \p VF.
1233 return VF.isVector() && MinBWs.contains(I) &&
1234 !isProfitableToScalarize(I, VF) &&
1236 }
1237
1238 /// Decision that was taken during cost calculation for memory instruction.
1241 CM_Widen, // For consecutive accesses with stride +1.
1242 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1249
1250 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1251 /// instruction \p I and vector width \p VF.
1254 assert(VF.isVector() && "Expected VF >=2");
1255 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1256 }
1257
1258 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1259 /// interleaving group \p Grp and vector width \p VF.
1263 assert(VF.isVector() && "Expected VF >=2");
1264 /// Broadcast this decision to all instructions inside the group.
1265 /// But the cost will be assigned to one instruction only.
1266 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1267 if (auto *I = Grp->getMember(i)) {
1268 if (Grp->getInsertPos() == I)
1269 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1270 else
1271 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1272 }
1273 }
1274 }
1275
1276 /// Return the cost model decision for the given instruction \p I and vector
1277 /// width \p VF. Return CM_Unknown if this instruction did not pass
1278 /// through the cost modeling.
1280 assert(VF.isVector() && "Expected VF to be a vector VF");
1281 assert(
1282 TheLoop->isInnermost() &&
1283 "cost-model should not be used for outer loops (in VPlan-native path)");
1284
1285 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1286 auto Itr = WideningDecisions.find(InstOnVF);
1287 if (Itr == WideningDecisions.end())
1288 return CM_Unknown;
1289 return Itr->second.first;
1290 }
1291
1292 /// Return the vectorization cost for the given instruction \p I and vector
1293 /// width \p VF.
1295 assert(VF.isVector() && "Expected VF >=2");
1296 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1297 assert(WideningDecisions.contains(InstOnVF) &&
1298 "The cost is not calculated");
1299 return WideningDecisions[InstOnVF].second;
1300 }
1301
1306 std::optional<unsigned> MaskPos;
1308 };
1309
1311 Function *Variant, Intrinsic::ID IID,
1312 std::optional<unsigned> MaskPos,
1314 assert(!VF.isScalar() && "Expected vector VF");
1315 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1316 MaskPos, Cost};
1317 }
1318
1320 ElementCount VF) const {
1321 assert(!VF.isScalar() && "Expected vector VF");
1322 return CallWideningDecisions.at(std::make_pair(CI, VF));
1323 }
1324
1325 /// Return True if instruction \p I is an optimizable truncate whose operand
1326 /// is an induction variable. Such a truncate will be removed by adding a new
1327 /// induction variable with the destination type.
1329 // If the instruction is not a truncate, return false.
1330 auto *Trunc = dyn_cast<TruncInst>(I);
1331 if (!Trunc)
1332 return false;
1333
1334 // Get the source and destination types of the truncate.
1335 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1336 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1337
1338 // If the truncate is free for the given types, return false. Replacing a
1339 // free truncate with an induction variable would add an induction variable
1340 // update instruction to each iteration of the loop. We exclude from this
1341 // check the primary induction variable since it will need an update
1342 // instruction regardless.
1343 Value *Op = Trunc->getOperand(0);
1344 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1345 return false;
1346
1347 // If the truncated value is not an induction variable, return false.
1348 return Legal->isInductionPhi(Op);
1349 }
1350
1351 /// Collects the instructions to scalarize for each predicated instruction in
1352 /// the loop.
1354
1355 /// Collect Uniform and Scalar values for the given \p VF.
1356 /// The sets depend on CM decision for Load/Store instructions
1357 /// that may be vectorized as interleave, gather-scatter or scalarized.
1358 /// Also make a decision on what to do about call instructions in the loop
1359 /// at that VF -- scalarize, call a known vector routine, or call a
1360 /// vector intrinsic.
1362 // Do the analysis once.
1363 if (VF.isScalar() || Uniforms.contains(VF))
1364 return;
1367 collectLoopUniforms(VF);
1368 collectLoopScalars(VF);
1369 }
1370
1371 /// Returns true if the target machine supports masked store operation
1372 /// for the given \p DataType and kind of access to \p Ptr.
1373 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1374 return Legal->isConsecutivePtr(DataType, Ptr) &&
1375 TTI.isLegalMaskedStore(DataType, Alignment);
1376 }
1377
1378 /// Returns true if the target machine supports masked load operation
1379 /// for the given \p DataType and kind of access to \p Ptr.
1380 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1381 return Legal->isConsecutivePtr(DataType, Ptr) &&
1382 TTI.isLegalMaskedLoad(DataType, Alignment);
1383 }
1384
1385 /// Returns true if the target machine can represent \p V as a masked gather
1386 /// or scatter operation.
1388 bool LI = isa<LoadInst>(V);
1389 bool SI = isa<StoreInst>(V);
1390 if (!LI && !SI)
1391 return false;
1392 auto *Ty = getLoadStoreType(V);
1394 if (VF.isVector())
1395 Ty = VectorType::get(Ty, VF);
1396 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1397 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1398 }
1399
1400 /// Returns true if the target machine supports all of the reduction
1401 /// variables found for the given VF.
1403 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1404 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1405 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1406 }));
1407 }
1408
1409 /// Given costs for both strategies, return true if the scalar predication
1410 /// lowering should be used for div/rem. This incorporates an override
1411 /// option so it is not simply a cost comparison.
1413 InstructionCost SafeDivisorCost) const {
1414 switch (ForceSafeDivisor) {
1415 case cl::BOU_UNSET:
1416 return ScalarCost < SafeDivisorCost;
1417 case cl::BOU_TRUE:
1418 return false;
1419 case cl::BOU_FALSE:
1420 return true;
1421 };
1422 llvm_unreachable("impossible case value");
1423 }
1424
1425 /// Returns true if \p I is an instruction which requires predication and
1426 /// for which our chosen predication strategy is scalarization (i.e. we
1427 /// don't have an alternate strategy such as masking available).
1428 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1430
1431 /// Returns true if \p I is an instruction that needs to be predicated
1432 /// at runtime. The result is independent of the predication mechanism.
1433 /// Superset of instructions that return true for isScalarWithPredication.
1434 bool isPredicatedInst(Instruction *I) const;
1435
1436 /// Return the costs for our two available strategies for lowering a
1437 /// div/rem operation which requires speculating at least one lane.
1438 /// First result is for scalarization (will be invalid for scalable
1439 /// vectors); second is for the safe-divisor strategy.
1440 std::pair<InstructionCost, InstructionCost>
1442 ElementCount VF) const;
1443
1444 /// Returns true if \p I is a memory instruction with consecutive memory
1445 /// access that can be widened.
1447
1448 /// Returns true if \p I is a memory instruction in an interleaved-group
1449 /// of memory accesses that can be vectorized with wide vector loads/stores
1450 /// and shuffles.
1452
1453 /// Check if \p Instr belongs to any interleaved access group.
1455 return InterleaveInfo.isInterleaved(Instr);
1456 }
1457
1458 /// Get the interleaved access group that \p Instr belongs to.
1461 return InterleaveInfo.getInterleaveGroup(Instr);
1462 }
1463
1464 /// Returns true if we're required to use a scalar epilogue for at least
1465 /// the final iteration of the original loop.
1466 bool requiresScalarEpilogue(bool IsVectorizing) const {
1468 return false;
1469 // If we might exit from anywhere but the latch, must run the exiting
1470 // iteration in scalar form.
1472 return true;
1473 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1474 }
1475
1476 /// Returns true if we're required to use a scalar epilogue for at least
1477 /// the final iteration of the original loop for all VFs in \p Range.
1478 /// A scalar epilogue must either be required for all VFs in \p Range or for
1479 /// none.
1481 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1482 return requiresScalarEpilogue(VF.isVector());
1483 };
1484 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1485 assert(
1486 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1487 "all VFs in range must agree on whether a scalar epilogue is required");
1488 return IsRequired;
1489 }
1490
1491 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1492 /// loop hint annotation.
1494 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1495 }
1496
1497 /// Returns the TailFoldingStyle that is best for the current loop.
1498 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1499 if (!ChosenTailFoldingStyle)
1501 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1502 : ChosenTailFoldingStyle->second;
1503 }
1504
1505 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1506 /// overflow or not.
1507 /// \param IsScalableVF true if scalable vector factors enabled.
1508 /// \param UserIC User specific interleave count.
1509 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1510 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1512 ChosenTailFoldingStyle =
1514 return;
1515 }
1516
1517 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1518 ChosenTailFoldingStyle = std::make_pair(
1519 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1520 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1521 return;
1522 }
1523
1524 // Set styles when forced.
1525 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1526 ForceTailFoldingStyle.getValue());
1528 return;
1529 // Override forced styles if needed.
1530 // FIXME: use actual opcode/data type for analysis here.
1531 // FIXME: Investigate opportunity for fixed vector factor.
1532 bool EVLIsLegal =
1533 IsScalableVF && UserIC <= 1 &&
1534 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1536 // FIXME: implement support for max safe dependency distance.
1538 // FIXME: remove this once reductions are supported.
1540 if (!EVLIsLegal) {
1541 // If for some reason EVL mode is unsupported, fallback to
1542 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1543 // in a generic way.
1544 ChosenTailFoldingStyle =
1547 LLVM_DEBUG(
1548 dbgs()
1549 << "LV: Preference for VP intrinsics indicated. Will "
1550 "not try to generate VP Intrinsics "
1551 << (UserIC > 1
1552 ? "since interleave count specified is greater than 1.\n"
1553 : "due to non-interleaving reasons.\n"));
1554 }
1555 }
1556
1557 /// Returns true if all loop blocks should be masked to fold tail loop.
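/// For example (conceptual sketch, not generated IR): with tail folding, a
/// loop of 10 iterations at VF = 4 runs 3 masked vector iterations, the last
/// under the lane mask <1, 1, 0, 0>, instead of branching to a scalar
/// remainder loop.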
1558 bool foldTailByMasking() const {
1559 // TODO: check if it is possible to check for None style independent of
1560 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1562 }
1563
1564 /// Returns true if the instructions in this block requires predication
1565 /// for any reason, e.g. because tail folding now requires a predicate
1566 /// or because the block in the original loop was predicated.
1569 }
1570
1571 /// Returns true if VP intrinsics with explicit vector length support should
1572 /// be generated in the tail folded loop.
1573 bool foldTailWithEVL() const {
1575 }
1576
1577 /// Returns true if the Phi is part of an inloop reduction.
1578 bool isInLoopReduction(PHINode *Phi) const {
1579 return InLoopReductions.contains(Phi);
1580 }
1581
1582 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1583 /// with factor VF. Return the cost of the instruction, including
1584 /// scalarization overhead if it's needed.
1586
1587 /// Estimate cost of a call instruction CI if it were vectorized with factor
1588 /// VF. Return the cost of the instruction, including scalarization overhead
1589 /// if it's needed.
1591
1592 /// Invalidates decisions already taken by the cost model.
1594 WideningDecisions.clear();
1595 CallWideningDecisions.clear();
1596 Uniforms.clear();
1597 Scalars.clear();
1598 }
1599
1600 /// The vectorization cost is a combination of the cost itself and a boolean
1601 /// indicating whether any of the contributing operations will actually
1602 /// operate on vector values after type legalization in the backend. If this
1603 /// latter value is false, then all operations will be scalarized (i.e. no
1604 /// vectorization has actually taken place).
1605 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1606
1607 /// Returns the expected execution cost. The unit of the cost does
1608 /// not matter because we use the 'cost' units to compare different
1609 /// vector widths. The cost that is returned is *not* normalized by
1610 /// the factor width. If \p Invalid is not nullptr, this function
1611 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1612 /// each instruction that has an Invalid cost for the given VF.
1616
1617 bool hasPredStores() const { return NumPredStores > 0; }
1618
1619 /// Returns true if epilogue vectorization is considered profitable, and
1620 /// false otherwise.
1621 /// \p VF is the vectorization factor chosen for the original loop.
1623
1624private:
1625 unsigned NumPredStores = 0;
1626
1627 /// \return An upper bound for the vectorization factors for both
1628 /// fixed and scalable vectorization, where the minimum-known number of
1629 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1630 /// disabled or unsupported, then the scalable part will be equal to
1631 /// ElementCount::getScalable(0).
1632 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1633 ElementCount UserVF,
1634 bool FoldTailByMasking);
1635
1636 /// \return the maximized element count based on the targets vector
1637 /// registers and the loop trip-count, but limited to a maximum safe VF.
1638 /// This is a helper function of computeFeasibleMaxVF.
1639 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1640 unsigned SmallestType,
1641 unsigned WidestType,
1642 ElementCount MaxSafeVF,
1643 bool FoldTailByMasking);
1644
1645 /// \return the maximum legal scalable VF, based on the safe max number
1646 /// of elements.
1647 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1648
1649 /// Returns the execution time cost of an instruction for a given vector
1650 /// width. Vector width of one means scalar.
1651 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1652
1653 /// The cost-computation logic from getInstructionCost which provides
1654 /// the vector type as an output parameter.
1655 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1656 Type *&VectorTy);
1657
1658 /// Return the cost of instructions in an inloop reduction pattern, if I is
1659 /// part of that pattern.
1660 std::optional<InstructionCost>
1661 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1663
1664 /// Calculate vectorization cost of memory instruction \p I.
1665 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1666
1667 /// The cost computation for scalarized memory instruction.
1668 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1669
1670 /// The cost computation for interleaving group of memory instructions.
1671 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1672
1673 /// The cost computation for Gather/Scatter instruction.
1674 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1675
1676 /// The cost computation for widening instruction \p I with consecutive
1677 /// memory access.
1678 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1679
1680 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1681 /// Load: scalar load + broadcast.
1682 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1683 /// element)
1684 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1685
1686 /// Estimate the overhead of scalarizing an instruction. This is a
1687 /// convenience wrapper for the type-based getScalarizationOverhead API.
1688 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1690
1691 /// Returns true if an artificially high cost for emulated masked memrefs
1692 /// should be used.
1693 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1694
1695 /// Map of scalar integer values to the smallest bitwidth they can be legally
1696 /// represented as. The vector equivalents of these values should be truncated
1697 /// to this type.
1699
1700 /// A type representing the costs for instructions if they were to be
1701 /// scalarized rather than vectorized. The entries are Instruction-Cost
1702 /// pairs.
1703 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1704
1705 /// A set containing all BasicBlocks that are known to be present after
1706 /// vectorization as predicated blocks.
1708 PredicatedBBsAfterVectorization;
1709
1710 /// Records whether it is allowed to have the original scalar loop execute at
1711 /// least once. This may be needed as a fallback loop in case runtime
1712 /// aliasing/dependence checks fail, or to handle the tail/remainder
1713 /// iterations when the trip count is unknown or doesn't divide by the VF,
1714 /// or as a peel-loop to handle gaps in interleave-groups.
1715 /// Under optsize and when the trip count is very small we don't allow any
1716 /// iterations to execute in the scalar loop.
1717 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1718
1719 /// The tail folding style finally chosen for the loop. The first element is
1720 /// used if the IV update may overflow; the second if it does not.
1721 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1722 ChosenTailFoldingStyle;
1723
1724 /// A map holding scalar costs for different vectorization factors. The
1725 /// presence of a cost for an instruction in the mapping indicates that the
1726 /// instruction will be scalarized when vectorizing with the associated
1727 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1729
1730 /// Holds the instructions known to be uniform after vectorization.
1731 /// The data is collected per VF.
1733
1734 /// Holds the instructions known to be scalar after vectorization.
1735 /// The data is collected per VF.
1737
1738 /// Holds the instructions (address computations) that are forced to be
1739 /// scalarized.
1741
1742 /// PHINodes of the reductions that should be expanded in-loop.
1743 SmallPtrSet<PHINode *, 4> InLoopReductions;
1744
1745 /// A Map of inloop reduction operations and their immediate chain operand.
1746 /// FIXME: This can be removed once reductions can be costed correctly in
1747 /// VPlan. This was added to allow quick lookup of the inloop operations.
1748 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1749
1750 /// Returns the expected difference in cost from scalarizing the expression
1751 /// feeding a predicated instruction \p PredInst. The instructions to
1752 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1753 /// non-negative return value implies the expression will be scalarized.
1754 /// Currently, only single-use chains are considered for scalarization.
1755 InstructionCost computePredInstDiscount(Instruction *PredInst,
1756 ScalarCostsTy &ScalarCosts,
1757 ElementCount VF);
1758
1759 /// Collect the instructions that are uniform after vectorization. An
1760 /// instruction is uniform if we represent it with a single scalar value in
1761 /// the vectorized loop corresponding to each vector iteration. Examples of
1762 /// uniform instructions include pointer operands of consecutive or
1763 /// interleaved memory accesses. Note that although uniformity implies an
1764 /// instruction will be scalar, the reverse is not true. In general, a
1765 /// scalarized instruction will be represented by VF scalar values in the
1766 /// vectorized loop, each corresponding to an iteration of the original
1767 /// scalar loop.
1768 void collectLoopUniforms(ElementCount VF);
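// Illustrative example (editorial note, not part of the original source): for
// a consecutive access such as
//   %p = getelementptr inbounds i32, ptr %A, i64 %iv
//   %x = load i32, ptr %p
// the address %p is uniform: a single scalar GEP per vector iteration feeds
// the wide load. A scalarized (non-uniform) instruction would instead be
// replicated VF times, once per lane, each corresponding to an iteration of
// the original scalar loop.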
1769
1770 /// Collect the instructions that are scalar after vectorization. An
1771 /// instruction is scalar if it is known to be uniform or will be scalarized
1772 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1773 /// to the list if they are used by a load/store instruction that is marked as
1774 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1775 /// VF values in the vectorized loop, each corresponding to an iteration of
1776 /// the original scalar loop.
1777 void collectLoopScalars(ElementCount VF);
1778
1779 /// Keeps the cost model's vectorization decisions and costs for instructions.
1780 /// Right now it is used for memory instructions only.
1782 std::pair<InstWidening, InstructionCost>>;
1783
1784 DecisionList WideningDecisions;
1785
1786 using CallDecisionList =
1787 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1788
1789 CallDecisionList CallWideningDecisions;
1790
1791 /// Returns true if \p V is expected to be vectorized and it needs to be
1792 /// extracted.
1793 bool needsExtract(Value *V, ElementCount VF) const {
1794 Instruction *I = dyn_cast<Instruction>(V);
1795 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1797 return false;
1798
1799 // Assume we can vectorize V (and hence we need extraction) if the
1800 // scalars are not computed yet. This can happen, because it is called
1801 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1802 // the scalars are collected. That should be a safe assumption in most
1803 // cases, because we check if the operands have vectorizable types
1804 // beforehand in LoopVectorizationLegality.
1805 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1806 };
1807
1808 /// Returns a range containing only operands needing to be extracted.
1809 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1810 ElementCount VF) const {
1812 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1813 }
1814
1815public:
1816 /// The loop that we evaluate.
1818
1819 /// Predicated scalar evolution analysis.
1821
1822 /// Loop Info analysis.
1824
1825 /// Vectorization legality.
1827
1828 /// Vector target information.
1830
1831 /// Target Library Info.
1833
1834 /// Demanded bits analysis.
1836
1837 /// Assumption cache.
1839
1840 /// Interface to emit optimization remarks.
1842
1844
1845 /// Loop Vectorize Hint.
1847
1848 /// The interleave access information contains groups of interleaved accesses
1849 /// with the same stride and close to each other.
1851
1852 /// Values to ignore in the cost model.
1854
1855 /// Values to ignore in the cost model when VF > 1.
1857
1858 /// All element types found in the loop.
1860};
1861} // end namespace llvm
1862
1863namespace {
1864/// Helper struct to manage generating runtime checks for vectorization.
1865///
1866 /// The runtime checks are created up-front in temporary blocks, un-linked from
1867 /// the existing IR, to allow better cost estimation. After deciding to
1868/// vectorize, the checks are moved back. If deciding not to vectorize, the
1869/// temporary blocks are completely removed.
1870class GeneratedRTChecks {
1871 /// Basic block which contains the generated SCEV checks, if any.
1872 BasicBlock *SCEVCheckBlock = nullptr;
1873
1874 /// The value representing the result of the generated SCEV checks. If it is
1875 /// nullptr, either no SCEV checks have been generated or they have been used.
1876 Value *SCEVCheckCond = nullptr;
1877
1878 /// Basic block which contains the generated memory runtime checks, if any.
1879 BasicBlock *MemCheckBlock = nullptr;
1880
1881 /// The value representing the result of the generated memory runtime checks.
1882 /// If it is nullptr, either no memory runtime checks have been generated or
1883 /// they have been used.
1884 Value *MemRuntimeCheckCond = nullptr;
1885
1886 DominatorTree *DT;
1887 LoopInfo *LI;
1889
1890 SCEVExpander SCEVExp;
1891 SCEVExpander MemCheckExp;
1892
1893 bool CostTooHigh = false;
1894 const bool AddBranchWeights;
1895
1896 Loop *OuterLoop = nullptr;
1897
1898public:
1899 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1901 bool AddBranchWeights)
1902 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1903 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1904
1905 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1906 /// accurately estimate the cost of the runtime checks. The blocks are
1907 /// un-linked from the IR and are added back during vector code generation. If
1908 /// there is no vector code generation, the check blocks are removed
1909 /// completely.
1910 void Create(Loop *L, const LoopAccessInfo &LAI,
1911 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1912
1913 // Hard cutoff to limit compile-time increase in case a very large number of
1914 // runtime checks need to be generated.
1915 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1916 // profile info.
1917 CostTooHigh =
1919 if (CostTooHigh)
1920 return;
1921
1922 BasicBlock *LoopHeader = L->getHeader();
1923 BasicBlock *Preheader = L->getLoopPreheader();
1924
1925 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1926 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1927 // may be used by SCEVExpander. The blocks will be un-linked from their
1928 // predecessors and removed from LI & DT at the end of the function.
1929 if (!UnionPred.isAlwaysTrue()) {
1930 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1931 nullptr, "vector.scevcheck");
1932
1933 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1934 &UnionPred, SCEVCheckBlock->getTerminator());
1935 }
1936
1937 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1938 if (RtPtrChecking.Need) {
1939 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1940 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1941 "vector.memcheck");
1942
1943 auto DiffChecks = RtPtrChecking.getDiffChecks();
1944 if (DiffChecks) {
1945 Value *RuntimeVF = nullptr;
1946 MemRuntimeCheckCond = addDiffRuntimeChecks(
1947 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1948 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1949 if (!RuntimeVF)
1950 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1951 return RuntimeVF;
1952 },
1953 IC);
1954 } else {
1955 MemRuntimeCheckCond = addRuntimeChecks(
1956 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1958 }
1959 assert(MemRuntimeCheckCond &&
1960 "no RT checks generated although RtPtrChecking "
1961 "claimed checks are required");
1962 }
1963
1964 if (!MemCheckBlock && !SCEVCheckBlock)
1965 return;
1966
1967 // Unhook the temporary block with the checks, update various places
1968 // accordingly.
1969 if (SCEVCheckBlock)
1970 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1971 if (MemCheckBlock)
1972 MemCheckBlock->replaceAllUsesWith(Preheader);
1973
1974 if (SCEVCheckBlock) {
1975 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1976 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1977 Preheader->getTerminator()->eraseFromParent();
1978 }
1979 if (MemCheckBlock) {
1980 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1981 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1982 Preheader->getTerminator()->eraseFromParent();
1983 }
1984
1985 DT->changeImmediateDominator(LoopHeader, Preheader);
1986 if (MemCheckBlock) {
1987 DT->eraseNode(MemCheckBlock);
1988 LI->removeBlock(MemCheckBlock);
1989 }
1990 if (SCEVCheckBlock) {
1991 DT->eraseNode(SCEVCheckBlock);
1992 LI->removeBlock(SCEVCheckBlock);
1993 }
1994
1995 // Outer loop is used as part of the later cost calculations.
1996 OuterLoop = L->getParentLoop();
1997 }
1998
1999 InstructionCost getCost() {
2000 if (SCEVCheckBlock || MemCheckBlock)
2001 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2002
2003 if (CostTooHigh) {
2005 Cost.setInvalid();
2006 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2007 return Cost;
2008 }
2009
2010 InstructionCost RTCheckCost = 0;
2011 if (SCEVCheckBlock)
2012 for (Instruction &I : *SCEVCheckBlock) {
2013 if (SCEVCheckBlock->getTerminator() == &I)
2014 continue;
2017 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2018 RTCheckCost += C;
2019 }
2020 if (MemCheckBlock) {
2021 InstructionCost MemCheckCost = 0;
2022 for (Instruction &I : *MemCheckBlock) {
2023 if (MemCheckBlock->getTerminator() == &I)
2024 continue;
2027 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2028 MemCheckCost += C;
2029 }
2030
2031 // If the runtime memory checks are being created inside an outer loop
2032 // we should find out if these checks are outer loop invariant. If so,
2033 // the checks will likely be hoisted out and so the effective cost will
2034 // be reduced in proportion to the outer loop trip count.
2035 if (OuterLoop) {
2036 ScalarEvolution *SE = MemCheckExp.getSE();
2037 // TODO: If profitable, we could refine this further by analysing every
2038 // individual memory check, since there could be a mixture of loop
2039 // variant and invariant checks that mean the final condition is
2040 // variant.
2041 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2042 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2043 // It seems reasonable to assume that we can reduce the effective
2044 // cost of the checks even when we know nothing about the trip
2045 // count. Assume that the outer loop executes at least twice.
2046 unsigned BestTripCount = 2;
2047
2048 // If the exact trip count is known, use that.
2049 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2050 BestTripCount = SmallTC;
2052 // Else use profile data if available.
2053 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2054 BestTripCount = *EstimatedTC;
2055 }
2056
2057 BestTripCount = std::max(BestTripCount, 1U);
2058 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2059
2060 // Let's ensure the cost is always at least 1.
2061 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2063
2064 if (BestTripCount > 1)
2066 << "We expect runtime memory checks to be hoisted "
2067 << "out of the outer loop. Cost reduced from "
2068 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2069
2070 MemCheckCost = NewMemCheckCost;
2071 }
2072 }
2073
2074 RTCheckCost += MemCheckCost;
2075 }
2076
2077 if (SCEVCheckBlock || MemCheckBlock)
2078 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2079 << "\n");
2080
2081 return RTCheckCost;
2082 }
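  // Worked example (editorial note): if the expanded memory checks cost 20
  // units and the enclosing outer loop has an estimated trip count of 10, the
  // checks are expected to be hoisted and their amortized cost becomes
  // 20 / 10 = 2 units (clamped to be at least 1).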
2083
2084 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2085 /// unused.
2086 ~GeneratedRTChecks() {
2087 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2088 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2089 if (!SCEVCheckCond)
2090 SCEVCleaner.markResultUsed();
2091
2092 if (!MemRuntimeCheckCond)
2093 MemCheckCleaner.markResultUsed();
2094
2095 if (MemRuntimeCheckCond) {
2096 auto &SE = *MemCheckExp.getSE();
2097 // Memory runtime check generation creates compares that use expanded
2098 // values. Remove them before running the SCEVExpanderCleaners.
2099 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2100 if (MemCheckExp.isInsertedInstruction(&I))
2101 continue;
2102 SE.forgetValue(&I);
2103 I.eraseFromParent();
2104 }
2105 }
2106 MemCheckCleaner.cleanup();
2107 SCEVCleaner.cleanup();
2108
2109 if (SCEVCheckCond)
2110 SCEVCheckBlock->eraseFromParent();
2111 if (MemRuntimeCheckCond)
2112 MemCheckBlock->eraseFromParent();
2113 }
2114
2115 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2116 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2117 /// depending on the generated condition.
2118 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2119 BasicBlock *LoopVectorPreHeader,
2120 BasicBlock *LoopExitBlock) {
2121 if (!SCEVCheckCond)
2122 return nullptr;
2123
2124 Value *Cond = SCEVCheckCond;
2125 // Mark the check as used, to prevent it from being removed during cleanup.
2126 SCEVCheckCond = nullptr;
2127 if (auto *C = dyn_cast<ConstantInt>(Cond))
2128 if (C->isZero())
2129 return nullptr;
2130
2131 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2132
2133 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2134 // Create new preheader for vector loop.
2135 if (OuterLoop)
2136 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2137
2138 SCEVCheckBlock->getTerminator()->eraseFromParent();
2139 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2140 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2141 SCEVCheckBlock);
2142
2143 DT->addNewBlock(SCEVCheckBlock, Pred);
2144 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2145
2146 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2147 if (AddBranchWeights)
2149 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2150 return SCEVCheckBlock;
2151 }
2152
2153 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2154 /// the branches to branch to the vector preheader or \p Bypass, depending on
2155 /// the generated condition.
2156 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2157 BasicBlock *LoopVectorPreHeader) {
2158 // Check if we generated code that checks at runtime whether arrays overlap.
2159 if (!MemRuntimeCheckCond)
2160 return nullptr;
2161
2162 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2163 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2164 MemCheckBlock);
2165
2166 DT->addNewBlock(MemCheckBlock, Pred);
2167 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2168 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2169
2170 if (OuterLoop)
2171 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2172
2173 BranchInst &BI =
2174 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2175 if (AddBranchWeights) {
2177 }
2178 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2179 MemCheckBlock->getTerminator()->setDebugLoc(
2180 Pred->getTerminator()->getDebugLoc());
2181
2182 // Mark the check as used, to prevent it from being removed during cleanup.
2183 MemRuntimeCheckCond = nullptr;
2184 return MemCheckBlock;
2185 }
2186};
2187} // namespace
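// Typical lifecycle of GeneratedRTChecks, as implied by the class above
// (editorial sketch; argument names are placeholders, not code from this file):
//   GeneratedRTChecks Checks(/* SE, DT, LI, ... */);
//   Checks.Create(L, LAI, UnionPred, VF, IC);   // build the checks up-front
//   InstructionCost C = Checks.getCost();       // feed the cost model
//   // If vectorization goes ahead, re-link the check blocks:
//   Checks.emitSCEVChecks(Bypass, VectorPH, ExitBlock);
//   Checks.emitMemRuntimeChecks(Bypass, VectorPH);
//   // Otherwise the destructor removes the unused temporary blocks.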
2188
2190 return Style == TailFoldingStyle::Data ||
2191 Style == TailFoldingStyle::DataAndControlFlow ||
2192 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2193}
2194
2196 return Style == TailFoldingStyle::DataAndControlFlow ||
2197 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2198}
2199
2200// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2201// vectorization. The loop needs to be annotated with #pragma omp simd
2202// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2203// vector length information is not provided, vectorization is not considered
2204// explicit. Interleave hints are not allowed either. These limitations will be
2205// relaxed in the future.
2206 // Please note that we are currently forced to abuse the pragma 'clang
2207// vectorize' semantics. This pragma provides *auto-vectorization hints*
2208// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2209// provides *explicit vectorization hints* (LV can bypass legal checks and
2210// assume that vectorization is legal). However, both hints are implemented
2211// using the same metadata (llvm.loop.vectorize, processed by
2212// LoopVectorizeHints). This will be fixed in the future when the native IR
2213// representation for pragma 'omp simd' is introduced.
2214static bool isExplicitVecOuterLoop(Loop *OuterLp,
2216 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2217 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2218
2219 // Only outer loops with an explicit vectorization hint are supported.
2220 // Unannotated outer loops are ignored.
2222 return false;
2223
2224 Function *Fn = OuterLp->getHeader()->getParent();
2225 if (!Hints.allowVectorization(Fn, OuterLp,
2226 true /*VectorizeOnlyWhenForced*/)) {
2227 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2228 return false;
2229 }
2230
2231 if (Hints.getInterleave() > 1) {
2232 // TODO: Interleave support is future work.
2233 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2234 "outer loops.\n");
2235 Hints.emitRemarkWithHints();
2236 return false;
2237 }
2238
2239 return true;
2240}
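// Illustrative example (editorial note): an outer loop that the checks above
// would accept, assuming the source annotates it with an explicit width, e.g.
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)       // outer loop, explicitly annotated
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[j];
// An unannotated outer loop, or one that also carries an interleave hint, is
// rejected.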
2241
2245 // Collect inner loops and outer loops without irreducible control flow. For
2246 // now, only collect outer loops that have explicit vectorization hints. If we
2247 // are stress testing the VPlan H-CFG construction, we collect the outermost
2248 // loop of every loop nest.
2249 if (L.isInnermost() || VPlanBuildStressTest ||
2251 LoopBlocksRPO RPOT(&L);
2252 RPOT.perform(LI);
2253 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2254 V.push_back(&L);
2255 // TODO: Collect inner loops inside marked outer loops in case
2256 // vectorization fails for the outer loop. Do not invoke
2257 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2258 // already known to be reducible. We can use an inherited attribute for
2259 // that.
2260 return;
2261 }
2262 }
2263 for (Loop *InnerL : L)
2264 collectSupportedLoops(*InnerL, LI, ORE, V);
2265}
2266
2267//===----------------------------------------------------------------------===//
2268// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2269// LoopVectorizationCostModel and LoopVectorizationPlanner.
2270//===----------------------------------------------------------------------===//
2271
2272/// Compute the transformed value of Index at offset StartValue using step
2273/// StepValue.
2274/// For integer induction, returns StartValue + Index * StepValue.
2275/// For pointer induction, returns StartValue[Index * StepValue].
2276/// FIXME: The newly created binary instructions should contain nsw/nuw
2277/// flags, which can be found from the original scalar operations.
2278static Value *
2280 Value *Step,
2282 const BinaryOperator *InductionBinOp) {
2283 Type *StepTy = Step->getType();
2284 Value *CastedIndex = StepTy->isIntegerTy()
2285 ? B.CreateSExtOrTrunc(Index, StepTy)
2286 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2287 if (CastedIndex != Index) {
2288 CastedIndex->setName(CastedIndex->getName() + ".cast");
2289 Index = CastedIndex;
2290 }
2291
2292 // Note: the IR at this point is broken. We cannot use SE to create any new
2293 // SCEV and then expand it, hoping that SCEV's simplification will give us
2294 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2295 // lead to various SCEV crashes. So all we can do is use the builder and rely
2296 // on InstCombine for future simplifications. Here we handle some trivial
2297 // cases only.
2298 auto CreateAdd = [&B](Value *X, Value *Y) {
2299 assert(X->getType() == Y->getType() && "Types don't match!");
2300 if (auto *CX = dyn_cast<ConstantInt>(X))
2301 if (CX->isZero())
2302 return Y;
2303 if (auto *CY = dyn_cast<ConstantInt>(Y))
2304 if (CY->isZero())
2305 return X;
2306 return B.CreateAdd(X, Y);
2307 };
2308
2309 // We allow X to be a vector type, in which case Y will potentially be
2310 // splatted into a vector with the same element count.
2311 auto CreateMul = [&B](Value *X, Value *Y) {
2312 assert(X->getType()->getScalarType() == Y->getType() &&
2313 "Types don't match!");
2314 if (auto *CX = dyn_cast<ConstantInt>(X))
2315 if (CX->isOne())
2316 return Y;
2317 if (auto *CY = dyn_cast<ConstantInt>(Y))
2318 if (CY->isOne())
2319 return X;
2320 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2321 if (XVTy && !isa<VectorType>(Y->getType()))
2322 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2323 return B.CreateMul(X, Y);
2324 };
2325
2326 switch (InductionKind) {
2328 assert(!isa<VectorType>(Index->getType()) &&
2329 "Vector indices not supported for integer inductions yet");
2330 assert(Index->getType() == StartValue->getType() &&
2331 "Index type does not match StartValue type");
2332 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2333 return B.CreateSub(StartValue, Index);
2334 auto *Offset = CreateMul(Index, Step);
2335 return CreateAdd(StartValue, Offset);
2336 }
2338 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2340 assert(!isa<VectorType>(Index->getType()) &&
2341 "Vector indices not supported for FP inductions yet");
2342 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2343 assert(InductionBinOp &&
2344 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2345 InductionBinOp->getOpcode() == Instruction::FSub) &&
2346 "Original bin op should be defined for FP induction");
2347
2348 Value *MulExp = B.CreateFMul(Step, Index);
2349 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2350 "induction");
2351 }
2353 return nullptr;
2354 }
2355 llvm_unreachable("invalid enum");
2356}
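// Worked example (editorial note): for an integer induction with start %start,
// a constant step of 4 and index %n.vec, the code emitted above is essentially
//   %offset = mul i64 %n.vec, 4
//   %end    = add i64 %start, %offset
// i.e. StartValue + Index * StepValue, as documented.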
2357
2358std::optional<unsigned> getMaxVScale(const Function &F,
2359 const TargetTransformInfo &TTI) {
2360 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2361 return MaxVScale;
2362
2363 if (F.hasFnAttribute(Attribute::VScaleRange))
2364 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2365
2366 return std::nullopt;
2367}
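// Example (editorial note): for a function carrying the attribute
// vscale_range(1,16), on a target whose TTI does not report a maximum vscale,
// this returns 16, the upper bound of the attribute's range.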
2368
2369 /// For the given VF and UF and maximum trip count computed for the loop, return
2370 /// whether the induction variable of the vectorized loop is guaranteed not to
2371 /// overflow, in which case the runtime overflow check always evaluates to false
2372 /// and can be removed.
2375 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2376 // Always be conservative if we don't know the exact unroll factor.
2377 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2378
2379 Type *IdxTy = Cost->Legal->getWidestInductionType();
2380 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2381
2382 // The runtime overflow check is known to be false iff the (max) trip-count
2383 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2384 // the vector loop induction variable.
2385 if (unsigned TC =
2386 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2387 uint64_t MaxVF = VF.getKnownMinValue();
2388 if (VF.isScalable()) {
2389 std::optional<unsigned> MaxVScale =
2390 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2391 if (!MaxVScale)
2392 return false;
2393 MaxVF *= *MaxVScale;
2394 }
2395
2396 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2397 }
2398
2399 return false;
2400}
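// Worked example (editorial note): with a widest induction type of i32, the
// maximum representable trip count is 2^32 - 1. For a known maximum trip count
// of 1000, a fixed VF of 4 and UF of 2, (2^32 - 1 - 1000) > 4 * 2 holds, so the
// induction cannot overflow and the runtime overflow check can be removed.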
2401
2402// Return whether we allow using masked interleave-groups (for dealing with
2403// strided loads/stores that reside in predicated blocks, or for dealing
2404// with gaps).
2406 // If an override option has been passed in for interleaved accesses, use it.
2407 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2409
2411}
2412
2413// Try to vectorize the interleave group that \p Instr belongs to.
2414//
2415// E.g. Translate following interleaved load group (factor = 3):
2416// for (i = 0; i < N; i+=3) {
2417// R = Pic[i]; // Member of index 0
2418// G = Pic[i+1]; // Member of index 1
2419// B = Pic[i+2]; // Member of index 2
2420// ... // do something to R, G, B
2421// }
2422// To:
2423// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2424// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2425// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2426// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2427//
2428// Or translate following interleaved store group (factor = 3):
2429// for (i = 0; i < N; i+=3) {
2430// ... do something to R, G, B
2431// Pic[i] = R; // Member of index 0
2432// Pic[i+1] = G; // Member of index 1
2433// Pic[i+2] = B; // Member of index 2
2434// }
2435// To:
2436// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2437// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2438// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2439// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2440// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2443 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2444 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2445 Instruction *Instr = Group->getInsertPos();
2446 const DataLayout &DL = Instr->getModule()->getDataLayout();
2447
2448 // Prepare for the vector type of the interleaved load/store.
2449 Type *ScalarTy = getLoadStoreType(Instr);
2450 unsigned InterleaveFactor = Group->getFactor();
2451 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2452
2453 // Prepare for the new pointers.
2454 SmallVector<Value *, 2> AddrParts;
2455 unsigned Index = Group->getIndex(Instr);
2456
2457 // TODO: extend the masked interleaved-group support to reversed access.
2458 assert((!BlockInMask || !Group->isReverse()) &&
2459 "Reversed masked interleave-group not supported.");
2460
2461 Value *Idx;
2462 // If the group is reverse, adjust the index to refer to the last vector lane
2463 // instead of the first. We adjust the index from the first vector lane,
2464 // rather than directly getting the pointer for lane VF - 1, because the
2465 // pointer operand of the interleaved access is supposed to be uniform. For
2466 // uniform instructions, we're only required to generate a value for the
2467 // first vector lane in each unroll iteration.
2468 if (Group->isReverse()) {
2469 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2470 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2474 } else
2476
2477 for (unsigned Part = 0; Part < UF; Part++) {
2478 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2479 if (auto *I = dyn_cast<Instruction>(AddrPart))
2480 State.setDebugLocFrom(I->getDebugLoc());
2481
2482 // Note that the current instruction could be a member at any index. We need
2483 // to adjust the address to that of the member at index 0.
2484 //
2485 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2486 // b = A[i]; // Member of index 0
2487 // The current pointer points to A[i+1]; adjust it to A[i].
2488 //
2489 // E.g. A[i+1] = a; // Member of index 1
2490 // A[i] = b; // Member of index 0
2491 // A[i+2] = c; // Member of index 2 (Current instruction)
2492 // The current pointer points to A[i+2]; adjust it to A[i].
2493
2494 bool InBounds = false;
2495 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2496 InBounds = gep->isInBounds();
2497 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2498 AddrParts.push_back(AddrPart);
2499 }
2500
2501 State.setDebugLocFrom(Instr->getDebugLoc());
2502 Value *PoisonVec = PoisonValue::get(VecTy);
2503
2504 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2505 unsigned Part, Value *MaskForGaps) -> Value * {
2506 if (VF.isScalable()) {
2507 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2508 assert(InterleaveFactor == 2 &&
2509 "Unsupported deinterleave factor for scalable vectors");
2510 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2511 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2512 auto *MaskTy =
2514 return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops,
2515 /*FMFSource=*/nullptr, "interleaved.mask");
2516 }
2517
2518 if (!BlockInMask)
2519 return MaskForGaps;
2520
2521 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2522 Value *ShuffledMask = Builder.CreateShuffleVector(
2523 BlockInMaskPart,
2524 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2525 "interleaved.mask");
2526 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2527 MaskForGaps)
2528 : ShuffledMask;
2529 };
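  // Illustrative example (editorial note): for a fixed VF of 4 and an
  // interleave factor of 3, a block mask <m0, m1, m2, m3> is replicated to
  //   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
  // so that all members of each interleaved tuple share their lane's
  // predicate; if a mask for gaps is also required, the two masks are combined
  // with an 'and'.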
2530
2531 // Vectorize the interleaved load group.
2532 if (isa<LoadInst>(Instr)) {
2533 Value *MaskForGaps = nullptr;
2534 if (NeedsMaskForGaps) {
2535 MaskForGaps =
2537 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2538 }
2539
2540 // For each unroll part, create a wide load for the group.
2541 SmallVector<Value *, 2> NewLoads;
2542 for (unsigned Part = 0; Part < UF; Part++) {
2543 Instruction *NewLoad;
2544 if (BlockInMask || MaskForGaps) {
2546 "masked interleaved groups are not allowed.");
2547 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2548 NewLoad =
2549 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2550 GroupMask, PoisonVec, "wide.masked.vec");
2551 }
2552 else
2553 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2554 Group->getAlign(), "wide.vec");
2555 Group->addMetadata(NewLoad);
2556 NewLoads.push_back(NewLoad);
2557 }
2558
2559 if (VecTy->isScalableTy()) {
2560 assert(InterleaveFactor == 2 &&
2561 "Unsupported deinterleave factor for scalable vectors");
2562
2563 for (unsigned Part = 0; Part < UF; ++Part) {
2564 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2565 // so must use intrinsics to deinterleave.
2567 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2568 /*FMFSource=*/nullptr, "strided.vec");
2569 unsigned J = 0;
2570 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2571 Instruction *Member = Group->getMember(I);
2572
2573 if (!Member)
2574 continue;
2575
2576 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2577 // If this member has a different type, cast the result to that type.
2578 if (Member->getType() != ScalarTy) {
2579 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2580 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2581 }
2582
2583 if (Group->isReverse())
2584 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2585
2586 State.set(VPDefs[J], StridedVec, Part);
2587 ++J;
2588 }
2589 }
2590
2591 return;
2592 }
2593
2594 // For each member in the group, shuffle out the appropriate data from the
2595 // wide loads.
2596 unsigned J = 0;
2597 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2598 Instruction *Member = Group->getMember(I);
2599
2600 // Skip the gaps in the group.
2601 if (!Member)
2602 continue;
2603
2604 auto StrideMask =
2605 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2606 for (unsigned Part = 0; Part < UF; Part++) {
2607 Value *StridedVec = Builder.CreateShuffleVector(
2608 NewLoads[Part], StrideMask, "strided.vec");
2609
2610 // If this member has a different type, cast the result to that type.
2611 if (Member->getType() != ScalarTy) {
2612 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2613 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2614 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2615 }
2616
2617 if (Group->isReverse())
2618 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2619
2620 State.set(VPDefs[J], StridedVec, Part);
2621 }
2622 ++J;
2623 }
2624 return;
2625 }
2626
2627 // The subvector type for the current instruction.
2628 auto *SubVT = VectorType::get(ScalarTy, VF);
2629
2630 // Vectorize the interleaved store group.
2631 Value *MaskForGaps =
2633 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2634 "masked interleaved groups are not allowed.");
2635 assert((!MaskForGaps || !VF.isScalable()) &&
2636 "masking gaps for scalable vectors is not yet supported.");
2637 for (unsigned Part = 0; Part < UF; Part++) {
2638 // Collect the stored vector from each member.
2639 SmallVector<Value *, 4> StoredVecs;
2640 unsigned StoredIdx = 0;
2641 for (unsigned i = 0; i < InterleaveFactor; i++) {
2642 assert((Group->getMember(i) || MaskForGaps) &&
2643 "Fail to get a member from an interleaved store group");
2644 Instruction *Member = Group->getMember(i);
2645
2646 // Skip the gaps in the group.
2647 if (!Member) {
2648 Value *Undef = PoisonValue::get(SubVT);
2649 StoredVecs.push_back(Undef);
2650 continue;
2651 }
2652
2653 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2654 ++StoredIdx;
2655
2656 if (Group->isReverse())
2657 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2658
2659 // If this member has a different type, cast it to a unified type.
2660
2661 if (StoredVec->getType() != SubVT)
2662 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2663
2664 StoredVecs.push_back(StoredVec);
2665 }
2666
2667 // Interleave all the smaller vectors into one wider vector.
2668 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2669 Instruction *NewStoreInstr;
2670 if (BlockInMask || MaskForGaps) {
2671 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2672 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2673 Group->getAlign(), GroupMask);
2674 } else
2675 NewStoreInstr =
2676 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2677
2678 Group->addMetadata(NewStoreInstr);
2679 }
2680}
2681
2683 VPReplicateRecipe *RepRecipe,
2684 const VPIteration &Instance,
2685 VPTransformState &State) {
2686 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2687
2688 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2689 // the first lane and part.
2690 if (isa<NoAliasScopeDeclInst>(Instr))
2691 if (!Instance.isFirstIteration())
2692 return;
2693
2694 // Does this instruction return a value?
2695 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2696
2697 Instruction *Cloned = Instr->clone();
2698 if (!IsVoidRetTy) {
2699 Cloned->setName(Instr->getName() + ".cloned");
2700#if !defined(NDEBUG)
2701 // Verify that VPlan type inference results agree with the type of the
2702 // generated values.
2703 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2704 "inferred type and type from generated instructions do not match");
2705#endif
2706 }
2707
2708 RepRecipe->setFlags(Cloned);
2709
2710 if (auto DL = Instr->getDebugLoc())
2711 State.setDebugLocFrom(DL);
2712
2713 // Replace the operands of the cloned instructions with their scalar
2714 // equivalents in the new loop.
2715 for (const auto &I : enumerate(RepRecipe->operands())) {
2716 auto InputInstance = Instance;
2717 VPValue *Operand = I.value();
2719 InputInstance.Lane = VPLane::getFirstLane();
2720 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2721 }
2722 State.addNewMetadata(Cloned, Instr);
2723
2724 // Place the cloned scalar in the new loop.
2725 State.Builder.Insert(Cloned);
2726
2727 State.set(RepRecipe, Cloned, Instance);
2728
2729 // If we just cloned a new assumption, add it to the assumption cache.
2730 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2732
2733 // End if-block.
2734 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2735 if (IfPredicateInstr)
2736 PredicatedInstructions.push_back(Cloned);
2737}
2738
2739Value *
2741 if (VectorTripCount)
2742 return VectorTripCount;
2743
2744 Value *TC = getTripCount();
2745 IRBuilder<> Builder(InsertBlock->getTerminator());
2746
2747 Type *Ty = TC->getType();
2748 // This is where we can make the step a runtime constant.
2749 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2750
2751 // If the tail is to be folded by masking, round the number of iterations N
2752 // up to a multiple of Step instead of rounding down. This is done by first
2753 // adding Step-1 and then rounding down. Note that it's ok if this addition
2754 // overflows: the vector induction variable will eventually wrap to zero given
2755 // that it starts at zero and its Step is a power of two; the loop will then
2756 // exit, with the last early-exit vector comparison also producing all-true.
2757 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2758 // is accounted for in emitIterationCountCheck that adds an overflow check.
2759 if (Cost->foldTailByMasking()) {
2761 "VF*UF must be a power of 2 when folding tail by masking");
2762 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2763 TC = Builder.CreateAdd(
2764 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2765 }
2766
2767 // Now we need to generate the expression for the part of the loop that the
2768 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2769 // iterations are not required for correctness, or N - Step, otherwise. Step
2770 // is equal to the vectorization factor (number of SIMD elements) times the
2771 // unroll factor (number of SIMD instructions).
2772 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2773
2774 // There are cases where we *must* run at least one iteration in the remainder
2775 // loop. See the cost model for when this can happen. If the step evenly
2776 // divides the trip count, we set the remainder to be equal to the step. If
2777 // the step does not evenly divide the trip count, no adjustment is necessary
2778 // since there will already be scalar iterations. Note that the minimum
2779 // iterations check ensures that N >= Step.
2780 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2781 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2782 R = Builder.CreateSelect(IsZero, Step, R);
2783 }
2784
2785 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2786
2787 return VectorTripCount;
2788}
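// Worked example (editorial note): for a trip count of 13 with VF = 4 and
// UF = 2, Step = 8 and n.mod.vf = 13 % 8 = 5, so the vector loop executes
// n.vec = 8 iterations and the scalar remainder handles the last 5. With tail
// folding, 13 is first rounded up so that n.vec = 16 and the masked vector
// loop covers every iteration. If a scalar epilogue is required and the trip
// count were 16 instead, the zero remainder is bumped to Step, giving
// n.vec = 8.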
2789
2791 const DataLayout &DL) {
2792 // Verify that V is a vector type with the same number of elements as DstVTy.
2793 auto *DstFVTy = cast<VectorType>(DstVTy);
2794 auto VF = DstFVTy->getElementCount();
2795 auto *SrcVecTy = cast<VectorType>(V->getType());
2796 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2797 Type *SrcElemTy = SrcVecTy->getElementType();
2798 Type *DstElemTy = DstFVTy->getElementType();
2799 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2800 "Vector elements must have same size");
2801
2802 // Do a direct cast if element types are castable.
2803 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2804 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2805 }
2806 // V cannot be cast directly to the desired vector type. This may happen
2807 // when V is a floating point vector but DstVTy is a vector of
2808 // pointers or vice-versa. Handle this using a two-step bitcast using an
2809 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2810 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2811 "Only one type should be a pointer type");
2812 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2813 "Only one type should be a floating point type");
2814 Type *IntTy =
2815 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2816 auto *VecIntTy = VectorType::get(IntTy, VF);
2817 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2818 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2819}
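// Illustrative example (editorial note): on a target with 64-bit pointers, a
// <4 x ptr> value cannot be bitcast directly to <4 x double>; it is instead
// converted as <4 x ptr> -> <4 x i64> -> <4 x double>, using the intermediate
// integer vector type described above.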
2820
2822 Value *Count = getTripCount();
2823 // Reuse the existing vector loop preheader for the TC checks.
2824 // Note that a new preheader block is generated for the vector loop.
2825 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2826 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2827
2828 // Generate code to check if the loop's trip count is less than VF * UF, or
2829 // equal to it in case a scalar epilogue is required; this implies that the
2830 // vector trip count is zero. This check also covers the case where adding one
2831 // to the backedge-taken count overflowed leading to an incorrect trip count
2832 // of zero. In this case we will also jump to the scalar loop.
2833 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2835
2836 // If tail is to be folded, vector loop takes care of all iterations.
2837 Type *CountTy = Count->getType();
2838 Value *CheckMinIters = Builder.getFalse();
2839 auto CreateStep = [&]() -> Value * {
2840 // Create step with max(MinProTripCount, UF * VF).
2842 return createStepForVF(Builder, CountTy, VF, UF);
2843
2844 Value *MinProfTC =
2846 if (!VF.isScalable())
2847 return MinProfTC;
2849 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2850 };
2851
2852 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2853 if (Style == TailFoldingStyle::None)
2854 CheckMinIters =
2855 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2856 else if (VF.isScalable() &&
2859 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2860 // an overflow to zero when updating induction variables and so an
2861 // additional overflow check is required before entering the vector loop.
2862
2863 // Get the maximum unsigned value for the type.
2864 Value *MaxUIntTripCount =
2865 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2866 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2867
2868 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2869 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2870 }
2871
2872 // Create new preheader for vector loop.
2874 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2875 "vector.ph");
2876
2877 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2878 DT->getNode(Bypass)->getIDom()) &&
2879 "TC check is expected to dominate Bypass");
2880
2881 // Update dominator for Bypass & LoopExit (if needed).
2882 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2883 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2884 // If there is an epilogue which must run, there's no edge from the
2885 // middle block to exit blocks and thus no need to update the immediate
2886 // dominator of the exit blocks.
2888
2889 BranchInst &BI =
2890 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2893 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2894 LoopBypassBlocks.push_back(TCCheckBlock);
2895}
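// Example (editorial note): without tail folding, and assuming the minimum
// profitable trip count does not exceed VF * UF, the guard emitted above for
// VF = 4 and UF = 2 is roughly
//   %min.iters.check = icmp ult i64 %count, 8   ; ule if a scalar epilogue
//                                               ; is required
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// For scalable VFs whose tail-folding style needs the extra guard, the vector
// loop is instead skipped when (UINT_MAX - %count) < VF * UF * vscale, which
// protects the induction update against overflow.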
2896
2898 BasicBlock *const SCEVCheckBlock =
2899 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2900 if (!SCEVCheckBlock)
2901 return nullptr;
2902
2903 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2905 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2906 "Cannot SCEV check stride or overflow when optimizing for size");
2907
2908
2909 // Update dominator only if this is first RT check.
2910 if (LoopBypassBlocks.empty()) {
2911 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2912 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2913 // If there is an epilogue which must run, there's no edge from the
2914 // middle block to exit blocks and thus no need to update the immediate
2915 // dominator of the exit blocks.
2916 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2917 }
2918
2919 LoopBypassBlocks.push_back(SCEVCheckBlock);
2920 AddedSafetyChecks = true;
2921 return SCEVCheckBlock;
2922}
2923
2925 // VPlan-native path does not do any analysis for runtime checks currently.
2927 return nullptr;
2928
2929 BasicBlock *const MemCheckBlock =
2930 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2931
2932 // Check if we generated code that checks at runtime whether arrays overlap. We put
2933 // the checks into a separate block to make the more common case of few
2934 // elements faster.
2935 if (!MemCheckBlock)
2936 return nullptr;
2937
2938 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2939 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2940 "Cannot emit memory checks when optimizing for size, unless forced "
2941 "to vectorize.");
2942 ORE->emit([&]() {
2943 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2946 << "Code-size may be reduced by not forcing "
2947 "vectorization, or by source-code modifications "
2948 "eliminating the need for runtime checks "
2949 "(e.g., adding 'restrict').";
2950 });
2951 }
2952
2953 LoopBypassBlocks.push_back(MemCheckBlock);
2954
2955 AddedSafetyChecks = true;
2956
2957 return MemCheckBlock;
2958}
2959
2963 assert(LoopVectorPreHeader && "Invalid loop structure");
2964 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2965 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2966 "multiple exit loop without required epilogue?");
2967
2970 LI, nullptr, Twine(Prefix) + "middle.block");
2973 nullptr, Twine(Prefix) + "scalar.ph");
2974
2975 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2976
2977 // Set up the middle block terminator. Two cases:
2978 // 1) If we know that we must execute the scalar epilogue, emit an
2979 // unconditional branch.
2980 // 2) Otherwise, we must have a single unique exit block (due to how we
2981 // implement the multiple exit case). In this case, set up a conditional
2982 // branch from the middle block to the loop scalar preheader, and the
2983 // exit block. completeLoopSkeleton will update the condition to use an
2984 // iteration check, if required to decide whether to execute the remainder.
2985 BranchInst *BrInst =
2986 Cost->requiresScalarEpilogue(VF.isVector())
2989 Builder.getTrue());
2990 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2992
2993 // Update dominator for loop exit. During skeleton creation, only the vector
2994 // pre-header and the middle block are created. The vector loop is entirely
2995 // created during VPlan execution.
2996 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2997 // If there is an epilogue which must run, there's no edge from the
2998 // middle block to exit blocks and thus no need to update the immediate
2999 // dominator of the exit blocks.
3001}
3002
3004 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3005 ArrayRef<BasicBlock *> BypassBlocks,
3006 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3008 assert(VectorTripCount && "Expected valid arguments");
3009
3010 Instruction *OldInduction = Legal->getPrimaryInduction();
3011 Value *&EndValue = IVEndValues[OrigPhi];
3012 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3013 if (OrigPhi == OldInduction) {
3014 // We know what the end value is.
3015 EndValue = VectorTripCount;
3016 } else {
3018
3019 // Fast-math-flags propagate from the original induction instruction.
3020 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3021 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3022
3024 Step, II.getKind(), II.getInductionBinOp());
3025 EndValue->setName("ind.end");
3026
3027 // Compute the end value for the additional bypass (if applicable).
3028 if (AdditionalBypass.first) {
3029 B.SetInsertPoint(AdditionalBypass.first,
3030 AdditionalBypass.first->getFirstInsertionPt());
3031 EndValueFromAdditionalBypass =
3032 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3033 Step, II.getKind(), II.getInductionBinOp());
3034 EndValueFromAdditionalBypass->setName("ind.end");
3035 }
3036 }
3037
3038 // Create phi nodes to merge from the backedge-taken check block.
3039 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3041 // Copy original phi DL over to the new one.
3042 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3043
3044 // The new PHI merges the original incoming value, in case of a bypass,
3045 // or the value at the end of the vectorized loop.
3046 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3047
3048 // Fix the scalar body counter (PHI node).
3049 // The old induction's phi node in the scalar body needs the truncated
3050 // value.
3051 for (BasicBlock *BB : BypassBlocks)
3052 BCResumeVal->addIncoming(II.getStartValue(), BB);
3053
3054 if (AdditionalBypass.first)
3055 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3056 EndValueFromAdditionalBypass);
3057 return BCResumeVal;
3058}
3059
3060/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3061/// expansion results.
3063 const SCEV2ValueTy &ExpandedSCEVs) {
3064 const SCEV *Step = ID.getStep();
3065 if (auto *C = dyn_cast<SCEVConstant>(Step))
3066 return C->getValue();
3067 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3068 return U->getValue();
3069 auto I = ExpandedSCEVs.find(Step);
3070 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3071 return I->second;
3072}
3073
3075 const SCEV2ValueTy &ExpandedSCEVs,
3076 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3077 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3078 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3079 "Inconsistent information about additional bypass.");
3080 // We are going to resume the execution of the scalar loop.
3081 // Go over all of the induction variables that we found and fix the
3082 // PHIs that are left in the scalar version of the loop.
3083 // The starting values of PHI nodes depend on the counter of the last
3084 // iteration in the vectorized loop.
3085 // If we come from a bypass edge then we need to start from the original
3086 // start value.
3087 for (const auto &InductionEntry : Legal->getInductionVars()) {
3088 PHINode *OrigPhi = InductionEntry.first;
3089 const InductionDescriptor &II = InductionEntry.second;
3090 PHINode *BCResumeVal = createInductionResumeValue(
3091 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3092 AdditionalBypass);
3093 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3094 }
3095}
3096
3098 // The trip counts should be cached by now.
3099 Value *Count = getTripCount();
3101
3102 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3103
3104 // Add a check in the middle block to see if we have completed
3105 // all of the iterations in the first vector loop. Three cases:
3106 // 1) If we require a scalar epilogue, there is no conditional branch as
3107 // we unconditionally branch to the scalar preheader. Do nothing.
3108 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3109 // Thus if tail is to be folded, we know we don't need to run the
3110 // remainder and we can use the previous value for the condition (true).
3111 // 3) Otherwise, construct a runtime check.
3112 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3113 !Cost->foldTailByMasking()) {
3114 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3115 // of the corresponding compare because they may have ended up with
3116 // different line numbers and we want to avoid awkward line stepping while
3117 // debugging, e.g. if the compare has a line number inside the loop.
3118 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3119 // operands. Perform simplification directly on VPlan once the branch is
3120 // modeled there.
3122 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3123 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3124 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3125 BI.setCondition(CmpN);
3126 if (hasBranchWeightMD(*ScalarLatchTerm)) {
3127 // Assume that `Count % VectorTripCount` is uniformly distributed.
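// Worked example (illustrative): for VF = 4 and UF = 2, TripCount below is 8
// and the weights become {1, 7}, i.e. the middle block is expected to skip
// the scalar remainder in roughly 1 out of every 8 executions.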
3128 unsigned TripCount = UF * VF.getKnownMinValue();
3129 assert(TripCount > 0 && "trip count should not be zero");
3130 const uint32_t Weights[] = {1, TripCount - 1};
3131 setBranchWeights(BI, Weights);
3132 }
3133 }
3134
3135#ifdef EXPENSIVE_CHECKS
3136 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3137#endif
3138
3139 return LoopVectorPreHeader;
3140}
3141
3142std::pair<BasicBlock *, Value *>
3144 const SCEV2ValueTy &ExpandedSCEVs) {
3145 /*
3146 In this function we generate a new loop. The new loop will contain
3147 the vectorized instructions while the old loop will continue to run the
3148 scalar remainder.
3149
3150 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3151 / | preheader are expanded here. Eventually all required SCEV
3152 / | expansion should happen here.
3153 / v
3154 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3155 | / |
3156 | / v
3157 || [ ] <-- vector pre header.
3158 |/ |
3159 | v
3160 | [ ] \
3161 | [ ]_| <-- vector loop (created during VPlan execution).
3162 | |
3163 | v
3164 \ -[ ] <--- middle-block.
3165 \/ |
3166 /\ v
3167 | ->[ ] <--- new preheader.
3168 | |
3169 (opt) v <-- edge from middle to exit iff epilogue is not required.
3170 | [ ] \
3171 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3172 \ |
3173 \ v
3174 >[ ] <-- exit block(s).
3175 ...
3176 */
3177
3178 // Create an empty vector loop, and prepare basic blocks for the runtime
3179 // checks.
3181
3182 // Now, compare the new count to zero. If it is zero skip the vector loop and
3183 // jump to the scalar loop. This check also covers the case where the
3184 // backedge-taken count is uint##_max: adding one to it will overflow leading
3185 // to an incorrect trip count of zero. In this (rare) case we will also jump
3186 // to the scalar loop.
3188
3189 // Generate the code to check any assumptions that we've made for SCEV
3190 // expressions.
3192
3193 // Generate the code that checks in runtime if arrays overlap. We put the
3194 // checks into a separate block to make the more common case of few elements
3195 // faster.
3197
3198 // Emit phis for the new starting index of the scalar loop.
3199 createInductionResumeValues(ExpandedSCEVs);
3200
3201 return {completeLoopSkeleton(), nullptr};
3202}
3203
3204// Fix up external users of the induction variable. At this point, we are
3205// in LCSSA form, with all external PHIs that use the IV having one input value,
3206// coming from the remainder loop. We need those PHIs to also have a correct
3207// value for the IV when arriving directly from the middle block.
3209 const InductionDescriptor &II,
3210 Value *VectorTripCount, Value *EndValue,
3211 BasicBlock *MiddleBlock,
3212 BasicBlock *VectorHeader, VPlan &Plan,
3213 VPTransformState &State) {
3214 // There are two kinds of external IV usages - those that use the value
3215 // computed in the last iteration (the PHI) and those that use the penultimate
3216 // value (the value that feeds into the phi from the loop latch).
3217 // We allow both, but they obviously have different values.
3218
3219 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3220
3221 DenseMap<Value *, Value *> MissingVals;
3222
3223 // An external user of the last iteration's value should see the value that
3224 // the remainder loop uses to initialize its own IV.
3226 for (User *U : PostInc->users()) {
3227 Instruction *UI = cast<Instruction>(U);
3228 if (!OrigLoop->contains(UI)) {
3229 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3230 MissingVals[UI] = EndValue;
3231 }
3232 }
3233
3234 // An external user of the penultimate value needs to see EndValue - Step.
3235 // The simplest way to get this is to recompute it from the constituent SCEVs,
3236 // that is, Start + (Step * (CRD - 1)).
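// For example (illustrative): for an integer IV with Start = 0, Step = 2 and
// a vector trip count of 8, the escaping penultimate value is
// 0 + 2 * (8 - 1) = 14, which is what emitTransformedIndex computes below
// from CountMinusOne.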
3237 for (User *U : OrigPhi->users()) {
3238 auto *UI = cast<Instruction>(U);
3239 if (!OrigLoop->contains(UI)) {
3240 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3241 IRBuilder<> B(MiddleBlock->getTerminator());
3242
3243 // Fast-math-flags propagate from the original induction instruction.
3244 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3245 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3246
3247 Value *CountMinusOne = B.CreateSub(
3248 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3249 CountMinusOne->setName("cmo");
3250
3251 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3252 assert(StepVPV && "step must have been expanded during VPlan execution");
3253 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3254 : State.get(StepVPV, {0, 0});
3255 Value *Escape =
3256 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3257 II.getKind(), II.getInductionBinOp());
3258 Escape->setName("ind.escape");
3259 MissingVals[UI] = Escape;
3260 }
3261 }
3262
3263 for (auto &I : MissingVals) {
3264 PHINode *PHI = cast<PHINode>(I.first);
3265 // One corner case we have to handle is two IVs "chasing" each other,
3266 // that is %IV2 = phi [...], [ %IV1, %latch ]
3267 // In this case, if IV1 has an external use, we need to avoid adding both
3268 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3269 // don't already have an incoming value for the middle block.
3270 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3271 PHI->addIncoming(I.second, MiddleBlock);
3272 Plan.removeLiveOut(PHI);
3273 }
3274 }
3275}
3276
3277namespace {
3278
3279struct CSEDenseMapInfo {
3280 static bool canHandle(const Instruction *I) {
3281 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3282 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3283 }
3284
3285 static inline Instruction *getEmptyKey() {
3287 }
3288
3289 static inline Instruction *getTombstoneKey() {
3291 }
3292
3293 static unsigned getHashValue(const Instruction *I) {
3294 assert(canHandle(I) && "Unknown instruction!");
3295 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3296 I->value_op_end()));
3297 }
3298
3299 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3300 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3301 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3302 return LHS == RHS;
3303 return LHS->isIdenticalTo(RHS);
3304 }
3305};
3306
3307} // end anonymous namespace
3308
3309 /// Perform CSE of induction variable instructions.
3310static void cse(BasicBlock *BB) {
3311 // Perform simple CSE.
3313 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3314 if (!CSEDenseMapInfo::canHandle(&In))
3315 continue;
3316
3317 // Check if we can replace this instruction with any of the
3318 // visited instructions.
3319 if (Instruction *V = CSEMap.lookup(&In)) {
3320 In.replaceAllUsesWith(V);
3321 In.eraseFromParent();
3322 continue;
3323 }
3324
3325 CSEMap[&In] = &In;
3326 }
3327}
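// For example (illustrative): if the vectorized header contains
//   %a = extractelement <4 x i32> %wide, i32 0
//   %b = extractelement <4 x i32> %wide, i32 0
// the second instruction hits the map entry created for the first, so all
// uses of %b are rewritten to %a and %b is erased.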
3328
3331 ElementCount VF) const {
3332 // We only need to calculate a cost if the VF is scalar; for actual vectors
3333 // we should already have a pre-calculated cost at each VF.
3334 if (!VF.isScalar())
3335 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3336
3338 Type *RetTy = CI->getType();
3340 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3341 return *RedCost;
3342
3344 for (auto &ArgOp : CI->args())
3345 Tys.push_back(ArgOp->getType());
3346
3347 InstructionCost ScalarCallCost =
3349
3350 // If this is an intrinsic we may have a lower cost for it.
3352 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3353 return std::min(ScalarCallCost, IntrinsicCost);
3354 }
3355 return ScalarCallCost;
3356}
3357
3359 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3360 return Elt;
3361 return VectorType::get(Elt, VF);
3362}
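// For example (illustrative): with VF = 4, an i32 element type becomes
// <4 x i32> and a float becomes <4 x float>; with a scalable VF such as
// vscale x 2, i32 becomes <vscale x 2 x i32>. Scalar VFs and types that are
// neither integer/pointer nor floating point are returned unchanged.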
3363
3366 ElementCount VF) const {
3368 assert(ID && "Expected intrinsic call!");
3369 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3370 FastMathFlags FMF;
3371 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3372 FMF = FPMO->getFastMathFlags();
3373
3376 SmallVector<Type *> ParamTys;
3377 std::transform(FTy->param_begin(), FTy->param_end(),
3378 std::back_inserter(ParamTys),
3379 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3380
3381 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3382 dyn_cast<IntrinsicInst>(CI));
3383 return TTI.getIntrinsicInstrCost(CostAttrs,
3385}
3386
3388 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3389 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3390 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3391}
3392
3394 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3395 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3396 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3397}
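// For example (illustrative): given <4 x i8> and <4 x i32>,
// smallestIntegerVectorType returns <4 x i8> and largestIntegerVectorType
// returns <4 x i32>.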
3398
3400 VPlan &Plan) {
3401 // Fix widened non-induction PHIs by setting up the PHI operands.
3403 fixNonInductionPHIs(Plan, State);
3404
3405 // At this point every instruction in the original loop is widened to a
3406 // vector form. Now we need to fix the recurrences in the loop. These PHI
3407 // nodes are currently empty because we did not want to introduce cycles.
3408 // This is the second stage of vectorizing recurrences. Note that fixing
3409 // reduction phis is already modeled in VPlan.
3410 // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3411 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3412 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3413 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3414 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3415 fixFixedOrderRecurrence(FOR, State);
3416 }
3417
3418 // Forget the original basic block.
3421
3422 // After vectorization, the exit blocks of the original loop will have
3423 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3424 // looked through single-entry phis.
3425 SmallVector<BasicBlock *> ExitBlocks;
3426 OrigLoop->getExitBlocks(ExitBlocks);
3427 for (BasicBlock *Exit : ExitBlocks)
3428 for (PHINode &PN : Exit->phis())
3430
3431 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3432 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3433 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3434 // No edge from the middle block to the unique exit block has been inserted
3435 // and there is nothing to fix from vector loop; phis should have incoming
3436 // from scalar loop only.
3437 } else {
3438 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3439 // the cost model.
3440
3441 // If we inserted an edge from the middle block to the unique exit block,
3442 // update uses outside the loop (phis) to account for the newly inserted
3443 // edge.
3444
3445 // Fix-up external users of the induction variables.
3446 for (const auto &Entry : Legal->getInductionVars())
3447 fixupIVUsers(Entry.first, Entry.second,
3449 IVEndValues[Entry.first], LoopMiddleBlock,
3450 VectorLoop->getHeader(), Plan, State);
3451 }
3452
3453 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3454 // in the exit block, so update the builder.
3455 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3456 State.CFG.ExitBB->getFirstNonPHIIt());
3457 for (const auto &KV : Plan.getLiveOuts())
3458 KV.second->fixPhi(Plan, State);
3459
3461 sinkScalarOperands(&*PI);
3462
3463 // Remove redundant induction instructions.
3464 cse(VectorLoop->getHeader());
3465
3466 // Set/update profile weights for the vector and remainder loops as original
3467 // loop iterations are now distributed among them. Note that the original
3468 // loop, represented by LoopScalarBody, becomes the remainder loop after
3469 // vectorization.
3470 //
3471 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may end
3472 // up with a slightly less accurate result, but that should be OK since the
3473 // profile is not inherently precise anyway. Note also that a possible bypass
3474 // of the vector code caused by legality checks is ignored, optimistically
3475 // assigning all the weight to the vector loop.
3476 //
3477 // For scalable vectorization we can't know at compile time how many iterations
3478 // of the loop are handled in one vector iteration, so instead we assume a
3481 VF.getKnownMinValue() * UF);
3482}
3483
3486 // This is the second phase of vectorizing first-order recurrences. An
3487 // overview of the transformation is described below. Suppose we have the
3488 // following loop.
3489 //
3490 // for (int i = 0; i < n; ++i)
3491 // b[i] = a[i] - a[i - 1];
3492 //
3493 // There is a first-order recurrence on "a". For this loop, the shorthand
3494 // scalar IR looks like:
3495 //
3496 // scalar.ph:
3497 // s_init = a[-1]
3498 // br scalar.body
3499 //
3500 // scalar.body:
3501 // i = phi [0, scalar.ph], [i+1, scalar.body]
3502 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3503 // s2 = a[i]
3504 // b[i] = s2 - s1
3505 // br cond, scalar.body, ...
3506 //
3507 // In this example, s1 is a recurrence because its value depends on the
3508 // previous iteration. In the first phase of vectorization, we created a
3509 // vector phi v1 for s1. We now complete the vectorization and produce the
3510 // shorthand vector IR shown below (for VF = 4, UF = 1).
3511 //
3512 // vector.ph:
3513 // v_init = vector(..., ..., ..., a[-1])
3514 // br vector.body
3515 //
3516 // vector.body
3517 // i = phi [0, vector.ph], [i+4, vector.body]
3518 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3519 // v2 = a[i, i+1, i+2, i+3];
3520 // v3 = vector(v1(3), v2(0, 1, 2))
3521 // b[i, i+1, i+2, i+3] = v2 - v3
3522 // br cond, vector.body, middle.block
3523 //
3524 // middle.block:
3525 // x = v2(3)
3526 // br scalar.ph
3527 //
3528 // scalar.ph:
3529 // s_init = phi [x, middle.block], [a[-1], otherwise]
3530 // br scalar.body
3531 //
3532 // After the vector loop completes execution, we extract the next value of
3533 // the recurrence (x) to use as the initial value in the scalar loop.
3534
3535 // Extract the last vector element in the middle block. This will be the
3536 // initial value for the recurrence when jumping to the scalar loop.
3537 VPValue *PreviousDef = PhiR->getBackedgeValue();
3538 Value *Incoming = State.get(PreviousDef, UF - 1);
3539 auto *ExtractForScalar = Incoming;
3540 auto *IdxTy = Builder.getInt32Ty();
3541 Value *RuntimeVF = nullptr;
3542 if (VF.isVector()) {
3543 auto *One = ConstantInt::get(IdxTy, 1);
3545 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3546 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3547 ExtractForScalar =
3548 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3549 }
3550
3551 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3552 assert(PhiR->getNumUsers() == 1 &&
3553 RecurSplice->getOpcode() ==
3555 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3556 SmallVector<VPLiveOut *> LiveOuts;
3557 for (VPUser *U : RecurSplice->users())
3558 if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3559 LiveOuts.push_back(LiveOut);
3560
3561 if (!LiveOuts.empty()) {
3562 // Extract the second-to-last element in the middle block if the
3563 // Phi is used outside the loop. We need to extract the phi itself
3564 // and not the last element (the phi update in the current iteration). This
3565 // will be the value when jumping to the exit block from the
3566 // LoopMiddleBlock, when the scalar loop is not run at all.
3567 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3568 if (VF.isVector()) {
3569 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3570 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3571 Incoming, Idx, "vector.recur.extract.for.phi");
3572 } else {
3573 assert(UF > 1 && "VF and UF cannot both be 1");
3574 // When the loop is unrolled without vectorizing, initialize
3575 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3576 // value of `Incoming`. This is analogous to the vectorized case above:
3577 // extracting the second-to-last element when VF > 1.
3578 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3579 }
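// For example (illustrative): with a fixed VF = 4 the index above is
// 4 - 2 = 2, so in the shorthand IR at the top of this function the extract
// yields v2(2), the value the scalar phi would hold if the remainder loop is
// skipped entirely.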
3580
3581 for (VPLiveOut *LiveOut : LiveOuts) {
3582 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3583 PHINode *LCSSAPhi = LiveOut->getPhi();
3584 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3585 State.Plan->removeLiveOut(LCSSAPhi);
3586 }
3587 }
3588
3589 // Fix the initial value of the original recurrence in the scalar loop.
3591 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3592 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3593 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3594 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3595 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3596 Start->addIncoming(Incoming, BB);
3597 }
3598
3599 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3600 Phi->setName("scalar.recur");
3601}
3602
3604 // The basic block and loop containing the predicated instruction.
3605 auto *PredBB = PredInst->getParent();
3606 auto *VectorLoop = LI->getLoopFor(PredBB);
3607
3608 // Initialize a worklist with the operands of the predicated instruction.
3609 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3610
3611 // Holds instructions that we need to analyze again. An instruction may be
3612 // reanalyzed if we don't yet know if we can sink it or not.
3613 SmallVector<Instruction *, 8> InstsToReanalyze;
3614
3615 // Returns true if a given use occurs in the predicated block. Phi nodes use
3616 // their operands in their corresponding predecessor blocks.
3617 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3618 auto *I = cast<Instruction>(U.getUser());
3619 BasicBlock *BB = I->getParent();
3620 if (auto *Phi = dyn_cast<PHINode>(I))
3621 BB = Phi->getIncomingBlock(
3622 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3623 return BB == PredBB;
3624 };
3625
3626 // Iteratively sink the scalarized operands of the predicated instruction
3627 // into the block we created for it. When an instruction is sunk, its
3628 // operands are then added to the worklist. The algorithm ends when a full
3629 // pass through the worklist doesn't sink a single instruction.
3630 bool Changed;
3631 do {
3632 // Add the instructions that need to be reanalyzed to the worklist, and
3633 // reset the changed indicator.
3634 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3635 InstsToReanalyze.clear();
3636 Changed = false;
3637
3638 while (!Worklist.empty()) {
3639 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3640
3641 // We can't sink an instruction if it is a phi node, is not in the loop,
3642 // may have side effects, or may read from memory.
3643 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3644 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3645 I->mayHaveSideEffects() || I->mayReadFromMemory())
3646 continue;
3647
3648 // If the instruction is already in PredBB, check if we can sink its
3649 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3650 // sinking the scalar instruction I, hence it appears in PredBB; but it
3651 // may have failed to sink I's operands (recursively), which we try
3652 // (again) here.
3653 if (I->getParent() == PredBB) {
3654 Worklist.insert(I->op_begin(), I->op_end());
3655 continue;
3656 }
3657
3658 // It's legal to sink the instruction if all its uses occur in the
3659 // predicated block. Otherwise, there's nothing to do yet, and we may
3660 // need to reanalyze the instruction.
3661 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3662 InstsToReanalyze.push_back(I);
3663 continue;
3664 }
3665
3666 // Move the instruction to the beginning of the predicated block, and add
3667 // its operands to the worklist.
3668 I->moveBefore(&*PredBB->getFirstInsertionPt());
3669 Worklist.insert(I->op_begin(), I->op_end());
3670
3671 // The sinking may have enabled other instructions to be sunk, so we will
3672 // need to iterate.
3673 Changed = true;
3674 }
3675 } while (Changed);
3676}
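// For example (illustrative): if a scalarized address computation
//   %gep = getelementptr inbounds i32, ptr %p, i64 %idx
// is used only by a scalarized store that was placed in a predicated block,
// the loop above moves %gep into that block and then re-examines the
// instruction defining %idx as a new sinking candidate.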
3677
3679 VPTransformState &State) {
3680 auto Iter = vp_depth_first_deep(Plan.getEntry());
3681 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3682 for (VPRecipeBase &P : VPBB->phis()) {
3683 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3684 if (!VPPhi)
3685 continue;
3686 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3687 // Make sure the builder has a valid insert point.
3688 Builder.SetInsertPoint(NewPhi);
3689 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3690 VPValue *Inc = VPPhi->getIncomingValue(i);
3691 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3692 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3693 }
3694 }
3695 }
3696}
3697
3698void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3699 // We should not collect Scalars more than once per VF. Right now, this
3700 // function is called from collectUniformsAndScalars(), which already does
3701 // this check. Collecting Scalars for VF=1 does not make any sense.
3702 assert(VF.isVector() && !Scalars.contains(VF) &&
3703 "This function should not be visited twice for the same VF");
3704
3705 // This avoids any chances of creating a REPLICATE recipe during planning
3706 // since that would result in generation of scalarized code during execution,
3707 // which is not supported for scalable vectors.
3708 if (VF.isScalable()) {
3709 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3710 return;
3711 }
3712
3714
3715 // These sets are used to seed the analysis with pointers used by memory
3716 // accesses that will remain scalar.
3718 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3719 auto *Latch = TheLoop->getLoopLatch();
3720
3721 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3722 // The pointer operands of loads and stores will be scalar as long as the
3723 // memory access is not a gather or scatter operation. The value operand of a
3724 // store will remain scalar if the store is scalarized.
3725 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3726 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3727 assert(WideningDecision != CM_Unknown &&
3728 "Widening decision should be ready at this moment");
3729 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3730 if (Ptr == Store->getValueOperand())
3731 return WideningDecision == CM_Scalarize;
3732 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3733 "Ptr is neither a value or pointer operand");
3734 return WideningDecision != CM_GatherScatter;
3735 };
3736
3737 // A helper that returns true if the given value is a bitcast or
3738 // getelementptr instruction contained in the loop.
3739 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3740 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3741 isa<GetElementPtrInst>(V)) &&
3743 };
3744
3745 // A helper that evaluates a memory access's use of a pointer. If the use will
3746 // be a scalar use and the pointer is only used by memory accesses, we place
3747 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3748 // PossibleNonScalarPtrs.
3749 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3750 // We only care about bitcast and getelementptr instructions contained in
3751 // the loop.
3752 if (!isLoopVaryingBitCastOrGEP(Ptr))
3753 return;
3754
3755 // If the pointer has already been identified as scalar (e.g., if it was
3756 // also identified as uniform), there's nothing to do.
3757 auto *I = cast<Instruction>(Ptr);
3758 if (Worklist.count(I))
3759 return;
3760
3761 // If the use of the pointer will be a scalar use, and all users of the
3762 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3763 // place the pointer in PossibleNonScalarPtrs.
3764 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3765 return isa<LoadInst>(U) || isa<StoreInst>(U);
3766 }))
3767 ScalarPtrs.insert(I);
3768 else
3769 PossibleNonScalarPtrs.insert(I);
3770 };
3771
3772 // We seed the scalars analysis with two classes of instructions: (1)
3773 // instructions marked uniform-after-vectorization and (2) bitcast,
3774 // getelementptr and (pointer) phi instructions used by memory accesses
3775 // requiring a scalar use.
3776 //
3777 // (1) Add to the worklist all instructions that have been identified as
3778 // uniform-after-vectorization.
3779 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3780
3781 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3782 // memory accesses requiring a scalar use. The pointer operands of loads and
3783 // stores will be scalar as long as the memory access is not a gather or
3784 // scatter operation. The value operand of a store will remain scalar if the
3785 // store is scalarized.
3786 for (auto *BB : TheLoop->blocks())
3787 for (auto &I : *BB) {
3788 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3789 evaluatePtrUse(Load, Load->getPointerOperand());
3790 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3791 evaluatePtrUse(Store, Store->getPointerOperand());
3792 evaluatePtrUse(Store, Store->getValueOperand());
3793 }
3794 }
3795 for (auto *I : ScalarPtrs)
3796 if (!PossibleNonScalarPtrs.count(I)) {
3797 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3798 Worklist.insert(I);
3799 }
3800
3801 // Insert the forced scalars.
3802 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3803 // induction variable when the PHI user is scalarized.
3804 auto ForcedScalar = ForcedScalars.find(VF);
3805 if (ForcedScalar != ForcedScalars.end())
3806 for (auto *I : ForcedScalar->second) {
3807 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3808 Worklist.insert(I);
3809 }
3810
3811 // Expand the worklist by looking through any bitcasts and getelementptr
3812 // instructions we've already identified as scalar. This is similar to the
3813 // expansion step in collectLoopUniforms(); however, here we're only
3814 // expanding to include additional bitcasts and getelementptr instructions.
3815 unsigned Idx = 0;
3816 while (Idx != Worklist.size()) {
3817 Instruction *Dst = Worklist[Idx++];
3818 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3819 continue;
3820 auto *Src = cast<Instruction>(Dst->getOperand(0));
3821 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3822 auto *J = cast<Instruction>(U);
3823 return !TheLoop->contains(J) || Worklist.count(J) ||
3824 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3825 isScalarUse(J, Src));
3826 })) {
3827 Worklist.insert(Src);
3828 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3829 }
3830 }
3831
3832 // An induction variable will remain scalar if all users of the induction
3833 // variable and induction variable update remain scalar.
3834 for (const auto &Induction : Legal->getInductionVars()) {
3835 auto *Ind = Induction.first;
3836 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3837
3838 // If tail-folding is applied, the primary induction variable will be used
3839 // to feed a vector compare.
3840 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3841 continue;
3842
3843 // Returns true if \p Indvar is a pointer induction that is used directly by
3844 // load/store instruction \p I.
3845 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3846 Instruction *I) {
3847 return Induction.second.getKind() ==
3849 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3850 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3851 };
3852
3853 // Determine if all users of the induction variable are scalar after
3854 // vectorization.
3855 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3856 auto *I = cast<Instruction>(U);
3857 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3858 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3859 });
3860 if (!ScalarInd)
3861 continue;
3862
3863 // If the induction variable update is a fixed-order recurrence, neither the
3864 // induction variable nor its update should be marked scalar after
3865 // vectorization.
3866 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3867 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3868 continue;
3869
3870 // Determine if all users of the induction variable update instruction are
3871 // scalar after vectorization.
3872 auto ScalarIndUpdate =
3873 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3874 auto *I = cast<Instruction>(U);
3875 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3876 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3877 });
3878 if (!ScalarIndUpdate)
3879 continue;
3880
3881 // The induction variable and its update instruction will remain scalar.
3882 Worklist.insert(Ind);
3883 Worklist.insert(IndUpdate);
3884 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3885 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3886 << "\n");
3887 }
3888
3889 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3890}
3891
3893 Instruction *I, ElementCount VF) const {
3894 if (!isPredicatedInst(I))
3895 return false;
3896
3897 // Do we have a non-scalar lowering for this predicated
3898 // instruction? No - it is scalar with predication.
3899 switch(I->getOpcode()) {
3900 default:
3901 return true;
3902 case Instruction::Call:
3903 if (VF.isScalar())
3904 return true;
3905 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3906 .Kind == CM_Scalarize;
3907 case Instruction::Load:
3908 case Instruction::Store: {
3910 auto *Ty = getLoadStoreType(I);
3911 Type *VTy = Ty;
3912 if (VF.isVector())
3913 VTy = VectorType::get(Ty, VF);
3914 const Align Alignment = getLoadStoreAlignment(I);
3915 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3916 TTI.isLegalMaskedGather(VTy, Alignment))
3917 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3918 TTI.isLegalMaskedScatter(VTy, Alignment));
3919 }
3920 case Instruction::UDiv:
3921 case Instruction::SDiv:
3922 case Instruction::SRem:
3923 case Instruction::URem: {
3924 // We have the option to use the safe-divisor idiom to avoid predication.
3925 // The cost based decision here will always select safe-divisor for
3926 // scalable vectors as scalarization isn't legal.
3927 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3928 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3929 }
3930 }
3931}
3932
3934 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3935 return false;
3936
3937 // Can we prove this instruction is safe to unconditionally execute?
3938 // If not, we must use some form of predication.
3939 switch(I->getOpcode()) {
3940 default:
3941 return false;
3942 case Instruction::Load:
3943 case Instruction::Store: {
3944 if (!Legal->isMaskRequired(I))
3945 return false;
3946 // When we know the load's address is loop invariant and the instruction
3947 // in the original scalar loop was unconditionally executed then we
3948 // don't need to mark it as a predicated instruction. Tail folding may
3949 // introduce additional predication, but we're guaranteed to always have
3950 // at least one active lane. We call Legal->blockNeedsPredication here
3951 // because it doesn't query tail-folding. For stores, we need to prove both
3952 // speculation safety (which follows from the same argument as for loads)
3953 // and that the value being stored is correct. The easiest form of the
3954 // latter is to require that all values stored are the same.
3956 (isa<LoadInst>(I) ||
3957 (isa<StoreInst>(I) &&
3958 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3959 !Legal->blockNeedsPredication(I->getParent()))
3960 return false;
3961 return true;
3962 }
3963 case Instruction::UDiv:
3964 case Instruction::SDiv:
3965 case Instruction::SRem:
3966 case Instruction::URem:
3967 // TODO: We can use the loop-preheader as context point here and get
3968 // context sensitive reasoning
3970 case Instruction::Call:
3971 return Legal->isMaskRequired(I);
3972 }
3973}
3974
3975std::pair<InstructionCost, InstructionCost>
3977 ElementCount VF) const {
3978 assert(I->getOpcode() == Instruction::UDiv ||
3979 I->getOpcode() == Instruction::SDiv ||
3980 I->getOpcode() == Instruction::SRem ||
3981 I->getOpcode() == Instruction::URem);
3983
3985
3986 // Scalarization isn't legal for scalable vector types
3987 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3988 if (!VF.isScalable()) {
3989 // Get the scalarization cost and scale this amount by the probability of
3990 // executing the predicated block. If the instruction is not predicated,
3991 // we fall through to the next case.
3992 ScalarizationCost = 0;
3993
3994 // These instructions have a non-void type, so account for the phi nodes
3995 // that we will create. This cost is likely to be zero. The phi node
3996 // cost, if any, should be scaled by the block probability because it
3997 // models a copy at the end of each predicated block.
3998 ScalarizationCost += VF.getKnownMinValue() *
3999 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4000
4001 // The cost of the non-predicated instruction.
4002 ScalarizationCost += VF.getKnownMinValue() *
4003 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4004
4005 // The cost of insertelement and extractelement instructions needed for
4006 // scalarization.
4007 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4008
4009 // Scale the cost by the probability of executing the predicated blocks.
4010 // This assumes the predicated block for each vector lane is equally
4011 // likely.
4012 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4013 }
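// Worked example (illustrative, with assumed costs): for VF = 4, a scalar
// udiv cost of 20, a phi cost of 0 and a scalarization overhead of 8, the
// estimate above is (4 * 0 + 4 * 20 + 8) / getReciprocalPredBlockProb(),
// i.e. the raw per-lane cost scaled down by the assumed probability of
// executing the predicated block.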
4014 InstructionCost SafeDivisorCost = 0;
4015
4016 auto *VecTy = ToVectorTy(I->getType(), VF);
4017
4018 // The cost of the select guard to ensure all lanes are well defined
4019 // after we speculate above any internal control flow.
4020 SafeDivisorCost += TTI.getCmpSelInstrCost(
4021 Instruction::Select, VecTy,
4022 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4024
4025 // Certain instructions can be cheaper to vectorize if they have a constant
4026 // second vector operand. One example of this are shifts on x86.
4027 Value *Op2 = I->getOperand(1);
4028 auto Op2Info = TTI.getOperandInfo(Op2);
4029 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4030 Legal->isInvariant(Op2))
4032
4033 SmallVector<const Value *, 4> Operands(I->operand_values());
4034 SafeDivisorCost += TTI.getArithmeticInstrCost(
4035 I->getOpcode(), VecTy, CostKind,
4036 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4037 Op2Info, Operands, I);
4038 return {ScalarizationCost, SafeDivisorCost};
4039}
4040
4042 Instruction *I, ElementCount VF) {
4043 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4045 "Decision should not be set yet.");
4046 auto *Group = getInterleavedAccessGroup(I);
4047 assert(Group && "Must have a group.");
4048
4049 // If the instruction's allocated size doesn't equal its type size, it
4050 // requires padding and will be scalarized.
4051 auto &DL = I->getModule()->getDataLayout();
4052 auto *ScalarTy = getLoadStoreType(I);
4053 if (hasIrregularType(ScalarTy, DL))
4054 return false;
4055
4056 // If the group involves a non-integral pointer, we may not be able to
4057 // losslessly cast all values to a common type.
4058 unsigned InterleaveFactor = Group->getFactor();
4059 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4060 for (unsigned i = 0; i < InterleaveFactor; i++) {
4061 Instruction *Member = Group->getMember(i);
4062 if (!Member)
4063 continue;
4064 auto *MemberTy = getLoadStoreType(Member);
4065 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4066 // Don't coerce non-integral pointers to integers or vice versa.
4067 if (MemberNI != ScalarNI) {
4068 // TODO: Consider adding special nullptr value case here
4069 return false;
4070 } else if (MemberNI && ScalarNI &&
4071 ScalarTy->getPointerAddressSpace() !=
4072 MemberTy->getPointerAddressSpace()) {
4073 return false;
4074 }
4075 }
4076
4077 // Check if masking is required.
4078 // A Group may need masking for one of two reasons: it resides in a block that
4079 // needs predication, or it was decided to use masking to deal with gaps
4080 // (either a gap at the end of a load-access that may result in a speculative
4081 // load, or any gaps in a store-access).
4082 bool PredicatedAccessRequiresMasking =
4083 blockNeedsPredicationForAnyReason(I->getParent()) &&
4085 bool LoadAccessWithGapsRequiresEpilogMasking =
4086 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4088 bool StoreAccessWithGapsRequiresMasking =
4089 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4090 if (!PredicatedAccessRequiresMasking &&
4091 !LoadAccessWithGapsRequiresEpilogMasking &&
4092 !StoreAccessWithGapsRequiresMasking)
4093 return true;
4094
4095 // If masked interleaving is required, we expect that the user/target had
4096 // enabled it, because otherwise it either wouldn't have been created or
4097 // it should have been invalidated by the CostModel.
4099 "Masked interleave-groups for predicated accesses are not enabled.");
4100
4101 if (Group->isReverse())
4102 return false;
4103
4104 auto *Ty = getLoadStoreType(I);
4105 const Align Alignment = getLoadStoreAlignment(I);
4106 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4107 : TTI.isLegalMaskedStore(Ty, Alignment);
4108}
4109
4111 Instruction *I, ElementCount VF) {
4112 // Get and ensure we have a valid memory instruction.
4113 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4114
4116 auto *ScalarTy = getLoadStoreType(I);
4117
4118 // In order to be widened, the pointer should be consecutive, first of all.
4119 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4120 return false;
4121
4122 // If the instruction is a store located in a predicated block, it will be
4123 // scalarized.
4124 if (isScalarWithPredication(I, VF))
4125 return false;
4126
4127 // If the instruction's allocated size doesn't equal its type size, it
4128 // requires padding and will be scalarized.
4129 auto &DL = I->getModule()->getDataLayout();
4130 if (hasIrregularType(ScalarTy, DL))
4131 return false;
4132
4133 return true;
4134}
4135
4136void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4137 // We should not collect Uniforms more than once per VF. Right now,
4138 // this function is called from collectUniformsAndScalars(), which
4139 // already does this check. Collecting Uniforms for VF=1 does not make any
4140 // sense.
4141
4142 assert(VF.isVector() && !Uniforms.contains(VF) &&
4143 "This function should not be visited twice for the same VF");
4144
4145 // Visit the list of Uniforms. If we do not find any uniform value, we will
4146 // not analyze it again; Uniforms.count(VF) will return 1.
4147 Uniforms[VF].clear();
4148
4149 // We now know that the loop is vectorizable!
4150 // Collect instructions inside the loop that will remain uniform after
4151 // vectorization.
4152
4153 // Global values, params, and instructions outside of the current loop are
4154 // out of scope.
4155 auto isOutOfScope = [&](Value *V) -> bool {
4156 Instruction *I = dyn_cast<Instruction>(V);
4157 return (!I || !TheLoop->contains(I));
4158 };
4159
4160 // Worklist containing uniform instructions demanding lane 0.
4161 SetVector<Instruction *> Worklist;
4162
4163 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4164 // that are scalar with predication must not be considered uniform after
4165 // vectorization, because that would create an erroneous replicating region
4166 // where only a single instance out of VF should be formed.
4167 // TODO: optimize such seldom cases if found important, see PR40816.
4168 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4169 if (isOutOfScope(I)) {
4170 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4171 << *I << "\n");
4172 return;
4173 }
4174 if (isScalarWithPredication(I, VF)) {
4175 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4176 << *I << "\n");
4177 return;
4178 }
4179 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4180 Worklist.insert(I);
4181 };
4182
4183 // Start with the conditional branches exiting the loop. If the branch
4184 // condition is an instruction contained in the loop that is only used by the
4185 // branch, it is uniform.
4187 TheLoop->getExitingBlocks(Exiting);
4188 for (BasicBlock *E : Exiting) {
4189 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
4190 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4191 addToWorklistIfAllowed(Cmp);
4192 }
4193
4194 auto PrevVF = VF.divideCoefficientBy(2);
4195 // Return true if all lanes perform the same memory operation, and we can
4196 // thus choose to execute only one.
4197 auto isUniformMemOpUse = [&](Instruction *I) {
4198 // If the value was already known to not be uniform for the previous
4199 // (smaller VF), it cannot be uniform for the larger VF.
4200 if (PrevVF.isVector()) {
4201 auto Iter = Uniforms.find(PrevVF);
4202 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4203 return false;
4204 }
4205 if (!Legal->isUniformMemOp(*I, VF))
4206 return false;
4207 if (isa<LoadInst>(I))
4208 // Loading the same address always produces the same result - at least
4209 // assuming aliasing and ordering which have already been checked.
4210 return true;
4211 // Storing the same value on every iteration.
4212 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4213 };
4214
4215 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4216 InstWidening WideningDecision = getWideningDecision(I, VF);
4217 assert(WideningDecision != CM_Unknown &&
4218 "Widening decision should be ready at this moment");
4219
4220 if (isUniformMemOpUse(I))
4221 return true;
4222
4223 return (WideningDecision == CM_Widen ||
4224 WideningDecision == CM_Widen_Reverse ||
4225 WideningDecision == CM_Interleave);
4226 };
4227
4228 // Returns true if Ptr is the pointer operand of a memory access instruction
4229 // I, I is known to not require scalarization, and the pointer is not also
4230 // stored.
4231 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4232 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4233 return false;
4234 return getLoadStorePointerOperand(I) == Ptr &&
4235 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4236 };
4237
4238 // Holds a list of values which are known to have at least one uniform use.
4239 // Note that there may be other uses which aren't uniform. A "uniform use"
4240 // here is something which only demands lane 0 of the unrolled iterations;
4241 // it does not imply that all lanes produce the same value (i.e., this is
4242 // not the usual meaning of uniform).
4243 SetVector<Value *> HasUniformUse;
4244
4245 // Scan the loop for instructions which are either a) known to have only
4246 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4247 for (auto *BB : TheLoop->blocks())
4248 for (auto &I : *BB) {
4249 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4250 switch (II->getIntrinsicID()) {
4251 case Intrinsic::sideeffect:
4252 case Intrinsic::experimental_noalias_scope_decl:
4253 case Intrinsic::assume:
4254 case Intrinsic::lifetime_start:
4255 case Intrinsic::lifetime_end:
4257 addToWorklistIfAllowed(&I);
4258 break;
4259 default:
4260 break;
4261 }
4262 }
4263
4264 // ExtractValue instructions must be uniform, because the operands are
4265 // known to be loop-invariant.
4266 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4267 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4268 "Expected aggregate value to be loop invariant");
4269 addToWorklistIfAllowed(EVI);
4270 continue;
4271 }
4272
4273 // If there's no pointer operand, there's nothing to do.
4275 if (!Ptr)
4276 continue;
4277
4278 if (isUniformMemOpUse(&I))
4279 addToWorklistIfAllowed(&I);
4280
4281 if (isVectorizedMemAccessUse(&I, Ptr))
4282 HasUniformUse.insert(Ptr);
4283 }
4284
4285 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4286 // demanding) users. Since loops are assumed to be in LCSSA form, this
4287 // disallows uses outside the loop as well.
4288 for (auto *V : HasUniformUse) {
4289 if (isOutOfScope(V))
4290 continue;
4291 auto *I = cast<Instruction>(V);
4292 auto UsersAreMemAccesses =
4293 llvm::all_of(I->users(), [&](User *U) -> bool {
4294 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4295 });
4296 if (UsersAreMemAccesses)
4297 addToWorklistIfAllowed(I);
4298 }
4299
4300 // Expand Worklist in topological order: whenever a new instruction
4301 // is added, its users should already be inside Worklist. This ensures
4302 // that a uniform instruction will only be used by uniform instructions.
4303 unsigned idx = 0;
4304 while (idx != Worklist.size()) {
4305 Instruction *I = Worklist[idx++];
4306
4307 for (auto *OV : I->operand_values()) {
4308 // isOutOfScope operands cannot be uniform instructions.
4309 if (isOutOfScope(OV))
4310 continue;
4311 // First-order recurrence phis should typically be considered
4312 // non-uniform.
4313 auto *OP = dyn_cast<PHINode>(OV);
4315 continue;
4316 // If all the users of the operand are uniform, then add the
4317 // operand into the uniform worklist.
4318 auto *OI = cast<Instruction>(OV);
4319 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4320 auto *J = cast<Instruction>(U);
4321 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4322 }))
4323 addToWorklistIfAllowed(OI);
4324 }
4325 }
4326
4327 // For an instruction to be added into Worklist above, all its users inside
4328 // the loop should also be in Worklist. However, this condition cannot be
4329 // true for phi nodes that form a cyclic dependence. We must process phi
4330 // nodes separately. An induction variable will remain uniform if all users
4331 // of the induction variable and induction variable update remain uniform.
4332 // The code below handles both pointer and non-pointer induction variables.
4333 BasicBlock *Latch = TheLoop->getLoopLatch();
4334 for (const auto &Induction : Legal->getInductionVars()) {
4335 auto *Ind = Induction.first;
4336 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4337
4338 // Determine if all users of the induction variable are uniform after
4339 // vectorization.
4340 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4341 auto *I = cast<Instruction>(U);
4342 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4343 isVectorizedMemAccessUse(I, Ind);
4344 });
4345 if (!UniformInd)
4346 continue;
4347
4348 // Determine if all users of the induction variable update instruction are
4349 // uniform after vectorization.
4350 auto UniformIndUpdate =
4351 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4352 auto *I = cast<Instruction>(U);
4353 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4354 isVectorizedMemAccessUse(I, IndUpdate);
4355 });
4356 if (!UniformIndUpdate)
4357 continue;
4358
4359 // The induction variable and its update instruction will remain uniform.
4360 addToWorklistIfAllowed(Ind);
4361 addToWorklistIfAllowed(IndUpdate);
4362 }
4363
4364 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4365}
4366
4368 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4369
4371 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4372 "runtime pointer checks needed. Enable vectorization of this "
4373 "loop with '#pragma clang loop vectorize(enable)' when "
4374 "compiling with -Os/-Oz",
4375 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4376 return true;
4377 }
4378
4379 if (!PSE.getPredicate().isAlwaysTrue()) {
4380 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4381 "runtime SCEV checks needed. Enable vectorization of this "
4382 "loop with '#pragma clang loop vectorize(enable)' when "
4383 "compiling with -Os/-Oz",
4384 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4385 return true;
4386 }
4387
4388 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4389 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4390 reportVectorizationFailure("Runtime stride check for small trip count",
4391 "runtime stride == 1 checks needed. Enable vectorization of "
4392 "this loop without such check by compiling with -Os/-Oz",
4393 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4394 return true;
4395 }
4396
4397 return false;
4398}
4399
4401LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4403 return ElementCount::getScalable(0);
4404
4406 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4407 "ScalableVectorizationDisabled", ORE, TheLoop);
4408 return ElementCount::getScalable(0);
4409 }
4410
4411 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4412
4413 auto MaxScalableVF = ElementCount::getScalable(
4414 std::numeric_limits<ElementCount::ScalarTy>::max());
4415
4416 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4417 // FIXME: While for scalable vectors this is currently sufficient, this should
4418 // be replaced by a more detailed mechanism that filters out specific VFs,
4419 // instead of invalidating vectorization for a whole set of VFs based on the
4420 // MaxVF.
4421
4422 // Disable scalable vectorization if the loop contains unsupported reductions.
4423 if (!canVectorizeReductions(MaxScalableVF)) {
4425 "Scalable vectorization not supported for the reduction "
4426 "operations found in this loop.",
4427 "ScalableVFUnfeasible", ORE, TheLoop);
4428 return ElementCount::getScalable(0);
4429 }
4430
4431 // Disable scalable vectorization if the loop contains any instructions
4432 // with element types not supported for scalable vectors.
4433 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4434 return !Ty->isVoidTy() &&
4436 })) {
4437 reportVectorizationInfo("Scalable vectorization is not supported "
4438 "for all element types found in this loop.",
4439 "ScalableVFUnfeasible", ORE, TheLoop);
4440 return ElementCount::getScalable(0);
4441 }
4442
4444 return MaxScalableVF;
4445
4446 // Limit MaxScalableVF by the maximum safe dependence distance.
4447 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4448 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4449 else
4450 MaxScalableVF = ElementCount::getScalable(0);
4451
4452 if (!MaxScalableVF)
4454 "Max legal vector width too small, scalable vectorization "
4455 "unfeasible.",
4456 "ScalableVFUnfeasible", ORE, TheLoop);
4457
4458 return MaxScalableVF;
4459}
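// For example (illustrative): with MaxSafeElements = 32 and a maximum vscale
// of 16, the clamp above limits MaxScalableVF to vscale x 2 (32 / 16).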
4460
4461FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4462 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4464 unsigned SmallestType, WidestType;
4465 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4466
4467 // Get the maximum safe dependence distance in bits computed by LAA.
4468 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4469 // the memory access that is most restrictive (involved in the smallest
4470 // dependence distance).
4471 unsigned MaxSafeElements =
4473
4474 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4475 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4476
4477 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4478 << ".\n");
4479 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4480 << ".\n");
4481
4482 // First analyze the UserVF, fall back if the UserVF should be ignored.
4483 if (UserVF) {
4484 auto MaxSafeUserVF =
4485 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4486
4487 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4488 // If `VF=vscale x N` is safe, then so is `VF=N`
4489 if (UserVF.isScalable())
4490 return FixedScalableVFPair(
4491 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4492 else
4493 return UserVF;
4494 }
4495
4496 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4497
4498 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4499 // is better to ignore the hint and let the compiler choose a suitable VF.
4500 if (!UserVF.isScalable()) {
4501 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4502 << " is unsafe, clamping to max safe VF="
4503 << MaxSafeFixedVF << ".\n");
4504 ORE->emit([&]() {
4505 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4507 TheLoop->getHeader())
4508 << "User-specified vectorization factor "
4509 << ore::NV("UserVectorizationFactor", UserVF)
4510 << " is unsafe, clamping to maximum safe vectorization factor "
4511 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4512 });
4513 return MaxSafeFixedVF;
4514 }
4515
4517 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4518 << " is ignored because scalable vectors are not "
4519 "available.\n");
4520 ORE->emit([&]() {
4521 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4523 TheLoop->getHeader())
4524 << "User-specified vectorization factor "
4525 << ore::NV("UserVectorizationFactor", UserVF)
4526 << " is ignored because the target does not support scalable "
4527 "vectors. The compiler will pick a more suitable value.";
4528 });
4529 } else {
4530 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4531 << " is unsafe. Ignoring scalable UserVF.\n");
4532 ORE->emit([&]() {
4533 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4535 TheLoop->getHeader())
4536 << "User-specified vectorization factor "
4537 << ore::NV("UserVectorizationFactor", UserVF)
4538 << " is unsafe. Ignoring the hint to let the compiler pick a "
4539 "more suitable value.";
4540 });
4541 }
4542 }
4543
4544 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4545 << " / " << WidestType << " bits.\n");
4546
4549 if (auto MaxVF =
4550 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4551 MaxSafeFixedVF, FoldTailByMasking))
4552 Result.FixedVF = MaxVF;
4553
4554 if (auto MaxVF =
4555 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4556 MaxSafeScalableVF, FoldTailByMasking))
4557 if (MaxVF.isScalable()) {
4558 Result.ScalableVF = MaxVF;
4559 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4560 << "\n");
4561 }
4562
4563 return Result;
4564}
4565
4569 // TODO: It may be useful to do this, since it's still likely to be
4570 // dynamically uniform if the target can skip.
4572 "Not inserting runtime ptr check for divergent target",
4573 "runtime pointer checks needed. Not enabled for divergent target",
4574 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4576 }
4577
4578 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4579 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4580 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4581 if (TC == 1) {
4582 reportVectorizationFailure("Single iteration (non) loop",
4583 "loop trip count is one, irrelevant for vectorization",
4584 "SingleIterationLoop", ORE, TheLoop);
4586 }
4587
4588 switch (ScalarEpilogueStatus) {
4590 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4592 [[fallthrough]];
4594 LLVM_DEBUG(
4595 dbgs() << "LV: vector predicate hint/switch found.\n"
4596 << "LV: Not allowing scalar epilogue, creating predicated "
4597 << "vector loop.\n");
4598 break;
4600 // fallthrough as a special case of OptForSize
4602 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4603 LLVM_DEBUG(
4604 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4605 else
4606 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4607 << "count.\n");
4608
4609 // Bail if runtime checks are required, which are not good when optimising
4610 // for size.
4613
4614 break;
4615 }
4616
4617 // The only loops we can vectorize without a scalar epilogue are loops with
4618 // a bottom-test and a single exiting block. We'd have to handle the fact
4619 // that not every instruction executes on the last iteration. This will
4620 // require a lane mask which varies through the vector loop body. (TODO)
4622 // If there was a tail-folding hint/switch, but we can't fold the tail by
4623 // masking, fallback to a vectorization with a scalar epilogue.
4624 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4625 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4626 "scalar epilogue instead.\n");
4627 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4628 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4629 }
4631 }
4632
4633 // Now try the tail folding
4634
4635 // Invalidate interleave groups that require an epilogue if we can't mask
4636 // the interleave-group.
4638 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4639 "No decisions should have been taken at this point");
4640 // Note: There is no need to invalidate any cost modeling decisions here, as
4641 // none were taken so far.
4643 }
4644
4645 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4646
4647 // Avoid tail folding if the trip count is known to be a multiple of any VF
4648 // we choose.
4649 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4650 MaxFactors.FixedVF.getFixedValue();
4651 if (MaxFactors.ScalableVF) {
4652 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4653 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4654 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4655 *MaxPowerOf2RuntimeVF,
4656 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4657 } else
4658 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4659 }
4660
4661 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4662 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4663 "MaxFixedVF must be a power of 2");
4664 unsigned MaxVFtimesIC =
4665 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4666 ScalarEvolution *SE = PSE.getSE();
4667 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4668 const SCEV *ExitCount = SE->getAddExpr(
4669 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4670 const SCEV *Rem = SE->getURemExpr(
4671 SE->applyLoopGuards(ExitCount, TheLoop),
4672 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4673 if (Rem->isZero()) {
4674 // Accept MaxFixedVF if we do not have a tail.
4675 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4676 return MaxFactors;
4677 }
4678 }
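// Illustrative numbers (assumed, not from any particular target): with a trip
// count of 64, MaxPowerOf2RuntimeVF = 8 and UserIC = 2, MaxVFtimesIC is 16 and
// ExitCount (BackedgeTakenCount + 1 = 64) urem 16 is 0, so no tail remains and
// the factors are accepted without tail folding. With a trip count of 70 the
// remainder is 6, so the code below considers folding the tail by masking.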
4679
4680 // If we don't know the precise trip count, or if the trip count that we
4681 // found modulo the vectorization factor is not zero, try to fold the tail
4682 // by masking.
4683 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4684 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4685 if (foldTailByMasking()) {
4687 LLVM_DEBUG(
4688 dbgs()
4689 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4690 "try to generate VP Intrinsics with scalable vector "
4691 "factors only.\n");
4692 // A tail-folded loop using VP intrinsics restricts the VF to be scalable
4693 // for now.
4694 // TODO: extend it for fixed vectors, if required.
4695 assert(MaxFactors.ScalableVF.isScalable() &&
4696 "Expected scalable vector factor.");
4697
4698 MaxFactors.FixedVF = ElementCount::getFixed(1);
4699 }
4700 return MaxFactors;
4701 }
4702
4703 // If there was a tail-folding hint/switch, but we can't fold the tail by
4704 // masking, fallback to a vectorization with a scalar epilogue.
4705 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4706 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4707 "scalar epilogue instead.\n");
4708 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4709 return MaxFactors;
4710 }
4711
4712 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4713 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4715 }
4716
4717 if (TC == 0) {
4718 reportVectorizationFailure(
4719 "Unable to calculate the loop count due to complex control flow",
4720 "unable to calculate the loop count due to complex control flow",
4721 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4723 }
4724
4725 reportVectorizationFailure(
4726 "Cannot optimize for size and vectorize at the same time.",
4727 "cannot optimize for size and vectorize at the same time. "
4728 "Enable vectorization of this loop with '#pragma clang loop "
4729 "vectorize(enable)' when compiling with -Os/-Oz",
4730 "NoTailLoopWithOptForSize", ORE, TheLoop);
4732}
4733
4734ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4735 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4736 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4737 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4738 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4739 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4740 : TargetTransformInfo::RGK_FixedWidthVector);
4741
4742 // Convenience function to return the minimum of two ElementCounts.
4743 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4744 assert((LHS.isScalable() == RHS.isScalable()) &&
4745 "Scalable flags must match");
4746 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4747 };
4748
4749 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4750 // Note that both WidestRegister and WidestType may not be powers of 2.
4751 auto MaxVectorElementCount = ElementCount::get(
4752 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4753 ComputeScalableMaxVF);
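// For illustration with assumed values: a 128-bit widest register and a widest
// element type of 32 bits give bit_floor(128 / 32) = 4, i.e. an initial
// MaxVectorElementCount of 4 (or vscale x 4 when computing the scalable
// maximum), before it is clamped to MaxSafeVF just below.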
4754 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4755 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4756 << (MaxVectorElementCount * WidestType) << " bits.\n");
4757
4758 if (!MaxVectorElementCount) {
4759 LLVM_DEBUG(dbgs() << "LV: The target has no "
4760 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4761 << " vector registers.\n");
4762 return ElementCount::getFixed(1);
4763 }
4764
4765 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4766 if (MaxVectorElementCount.isScalable() &&
4767 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4768 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4769 auto Min = Attr.getVScaleRangeMin();
4770 WidestRegisterMinEC *= Min;
4771 }
4772
4773 // When a scalar epilogue is required, at least one iteration of the scalar
4774 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4775 // max VF that results in a dead vector loop.
4776 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4777 MaxTripCount -= 1;
4778
4779 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4780 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4781 // If the upper bound on the loop trip count (TC) is known at compile time,
4782 // there is no point in choosing a VF greater than TC (as done in the loop
4783 // below). Select the maximum power of two which doesn't exceed TC. If
4784 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when the
4785 // TC is less than or equal to the known number of lanes.
4786 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4787 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4788 "exceeding the constant trip count: "
4789 << ClampedUpperTripCount << "\n");
4790 return ElementCount::get(
4791 ClampedUpperTripCount,
4792 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4793 }
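// For example, a loop known to run at most 7 iterations is clamped to
// bit_floor(7) = 4 lanes here, even if the registers could hold more elements
// (the numbers are purely illustrative).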
4794
4795 TargetTransformInfo::RegisterKind RegKind =
4796 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4797 : TargetTransformInfo::RGK_FixedWidthVector;
4798 ElementCount MaxVF = MaxVectorElementCount;
4799 if (MaximizeBandwidth ||
4800 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4803 auto MaxVectorElementCountMaxBW = ElementCount::get(
4804 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4805 ComputeScalableMaxVF);
4806 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4807
4808 // Collect all viable vectorization factors larger than the default MaxVF
4809 // (i.e. MaxVectorElementCount).
4811 for (ElementCount VS = MaxVectorElementCount * 2;
4812 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4813 VFs.push_back(VS);
4814
4815 // For each VF calculate its register usage.
4816 auto RUs = calculateRegisterUsage(VFs);
4817
4818 // Select the largest VF which doesn't require more registers than existing
4819 // ones.
4820 for (int i = RUs.size() - 1; i >= 0; --i) {
4821 bool Selected = true;
4822 for (auto &pair : RUs[i].MaxLocalUsers) {
4823 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4824 if (pair.second > TargetNumRegisters)
4825 Selected = false;
4826 }
4827 if (Selected) {
4828 MaxVF = VFs[i];
4829 break;
4830 }
4831 }
4832 if (ElementCount MinVF =
4833 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4834 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4835 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4836 << ") with target's minimum: " << MinVF << '\n');
4837 MaxVF = MinVF;
4838 }
4839 }
4840
4841 // Invalidate any widening decisions we might have made, in case the loop
4842 // requires prediction (decided later), but we have already made some
4843 // load/store widening decisions.
4845 }
4846 return MaxVF;
4847}
4848
4849/// Convenience function that returns the value of vscale_range if
4850/// vscale_range.min == vscale_range.max, or otherwise the value returned by
4851/// the corresponding TTI method.
4852static std::optional<unsigned>
4854 const Function *Fn = L->getHeader()->getParent();
4855 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4856 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4857 auto Min = Attr.getVScaleRangeMin();
4858 auto Max = Attr.getVScaleRangeMax();
4859 if (Max && Min == Max)
4860 return Max;
4861 }
4862
4863 return TTI.getVScaleForTuning();
4864}
4865
4866bool LoopVectorizationPlanner::isMoreProfitable(
4867 const VectorizationFactor &A, const VectorizationFactor &B) const {
4868 InstructionCost CostA = A.Cost;
4869 InstructionCost CostB = B.Cost;
4870
4871 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4872
4873 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4874 // If the trip count is a known (possibly small) constant, the trip count
4875 // will be rounded up to an integer number of iterations under
4876 // FoldTailByMasking. The total cost in that case will be
4877 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4878 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4879 // some extra overheads, but for the purpose of comparing the costs of
4880 // different VFs we can use this to compare the total loop-body cost
4881 // expected after vectorization.
4882 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4883 InstructionCost VectorCost,
4884 InstructionCost ScalarCost) {
4885 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4886 : VectorCost * (MaxTripCount / VF) +
4887 ScalarCost * (MaxTripCount % VF);
4888 };
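// A worked example with assumed costs: for MaxTripCount = 10, scalar body cost
// 4, candidate A with Width = 4 and CostA = 8, candidate B with Width = 8 and
// CostB = 12, and no tail folding, the comparison below computes
// RTCostA = 8 * floor(10/4) + 4 * (10 % 4) = 16 + 8 = 24 and
// RTCostB = 12 * floor(10/8) + 4 * (10 % 8) = 12 + 8 = 20, so B is preferred
// even though its per-iteration vector cost is higher.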
4889 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4890 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4891
4892 return RTCostA < RTCostB;
4893 }
4894
4895 // Improve estimate for the vector width if it is scalable.
4896 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4897 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4898 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4899 if (A.Width.isScalable())
4900 EstimatedWidthA *= *VScale;
4901 if (B.Width.isScalable())
4902 EstimatedWidthB *= *VScale;
4903 }
4904
4905 // Assume vscale may be larger than 1 (or the value being tuned for),
4906 // so that scalable vectorization is slightly favorable over fixed-width
4907 // vectorization.
4908 if (A.Width.isScalable() && !B.Width.isScalable())
4909 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4910
4911 // To avoid the need for FP division:
4912 // (CostA / A.Width) < (CostB / B.Width)
4913 // <=> (CostA * B.Width) < (CostB * A.Width)
4914 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4915}
4916
4919 Loop *TheLoop) {
4920 if (InvalidCosts.empty())
4921 return;
4922
4923 // Emit a report of VFs with invalid costs in the loop.
4924
4925 // Group the remarks per instruction, keeping the instruction order from
4926 // InvalidCosts.
4927 std::map<Instruction *, unsigned> Numbering;
4928 unsigned I = 0;
4929 for (auto &Pair : InvalidCosts)
4930 if (!Numbering.count(Pair.first))
4931 Numbering[Pair.first] = I++;
4932
4933 // Sort the list, first on instruction(number) then on VF.
4934 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4935 if (Numbering[A.first] != Numbering[B.first])
4936 return Numbering[A.first] < Numbering[B.first];
4938 return ECC(A.second, B.second);
4939 });
4940
4941 // For a list of ordered instruction-vf pairs:
4942 // [(load, vf1), (load, vf2), (store, vf1)]
4943 // Group the instructions together to emit separate remarks for:
4944 // load (vf1, vf2)
4945 // store (vf1)
4946 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4947 auto Subset = ArrayRef<InstructionVFPair>();
4948 do {
4949 if (Subset.empty())
4950 Subset = Tail.take_front(1);
4951
4952 Instruction *I = Subset.front().first;
4953
4954 // If the next instruction is different, or if there are no other pairs,
4955 // emit a remark for the collated subset. e.g.
4956 // [(load, vf1), (load, vf2)]
4957 // to emit:
4958 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4959 if (Subset == Tail || Tail[Subset.size()].first != I) {
4960 std::string OutString;
4961 raw_string_ostream OS(OutString);
4962 assert(!Subset.empty() && "Unexpected empty range");
4963 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4964 for (const auto &Pair : Subset)
4965 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4966 OS << "):";
4967 if (auto *CI = dyn_cast<CallInst>(I))
4968 OS << " call to " << CI->getCalledFunction()->getName();
4969 else
4970 OS << " " << I->getOpcodeName();
4971 OS.flush();
4972 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4973 Tail = Tail.drop_front(Subset.size());
4974 Subset = {};
4975 } else
4976 // Grow the subset by one element
4977 Subset = Tail.take_front(Subset.size() + 1);
4978 } while (!Tail.empty());
4979}
4980
4981VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
4982 const ElementCountSet &VFCandidates) {
4983 InstructionCost ExpectedCost =
4985 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4986 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4987 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
4988 "Expected Scalar VF to be a candidate");
4989
4990 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4991 ExpectedCost);
4992 VectorizationFactor ChosenFactor = ScalarCost;
4993
4994 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4995 if (ForceVectorization && VFCandidates.size() > 1) {
4996 // Ignore scalar width, because the user explicitly wants vectorization.
4997 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4998 // evaluation.
4999 ChosenFactor.Cost = InstructionCost::getMax();
5000 }
5001
5002 SmallVector<InstructionVFPair> InvalidCosts;
5003 for (const auto &i : VFCandidates) {
5004 // The cost for scalar VF=1 is already calculated, so ignore it.
5005 if (i.isScalar())
5006 continue;
5007
5009 CM.expectedCost(i, &InvalidCosts);
5010 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5011
5012#ifndef NDEBUG
5013 unsigned AssumedMinimumVscale =
5014 getVScaleForTuning(OrigLoop, TTI).value_or(1);
5015 unsigned Width =
5016 Candidate.Width.isScalable()
5017 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5018 : Candidate.Width.getFixedValue();
5019 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5020 << " costs: " << (Candidate.Cost / Width));
5021 if (i.isScalable())
5022 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5023 << AssumedMinimumVscale << ")");
5024 LLVM_DEBUG(dbgs() << ".\n");
5025#endif
5026
5027 if (!C.second && !ForceVectorization) {
5028 LLVM_DEBUG(
5029 dbgs() << "LV: Not considering vector loop of width " << i
5030 << " because it will not generate any vector instructions.\n");
5031 continue;
5032 }
5033
5034 // If profitable, add it to the ProfitableVFs list.
5035 if (isMoreProfitable(Candidate, ScalarCost))
5036 ProfitableVFs.push_back(Candidate);
5037
5038 if (isMoreProfitable(Candidate, ChosenFactor))
5039 ChosenFactor = Candidate;
5040 }
5041
5042 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5043
5046 "There are conditional stores.",
5047 "store that is conditionally executed prevents vectorization",
5048 "ConditionalStore", ORE, OrigLoop);
5049 ChosenFactor = ScalarCost;
5050 }
5051
5052 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5053 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5054 << "LV: Vectorization seems to be not beneficial, "
5055 << "but was forced by a user.\n");
5056 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5057 return ChosenFactor;
5058}
5059
5060bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5061 ElementCount VF) const {
5062 // Cross iteration phis such as reductions need special handling and are
5063 // currently unsupported.
5064 if (any_of(OrigLoop->getHeader()->phis(),
5065 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5066 return false;
5067
5068 // Phis with uses outside of the loop require special handling and are
5069 // currently unsupported.
5070 for (const auto &Entry : Legal->getInductionVars()) {
5071 // Look for uses of the value of the induction at the last iteration.
5072 Value *PostInc =
5073 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5074 for (User *U : PostInc->users())
5075 if (!OrigLoop->contains(cast<Instruction>(U)))
5076 return false;
5077 // Look for uses of the penultimate value of the induction.
5078 for (User *U : Entry.first->users())
5079 if (!OrigLoop->contains(cast<Instruction>(U)))
5080 return false;
5081 }
5082
5083 // Epilogue vectorization code has not been audited to ensure it handles
5084 // non-latch exits properly. It may be fine, but it needs to be audited and
5085 // tested.
5086 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5087 return false;
5088
5089 return true;
5090}
5091
5093 const ElementCount VF) const {
5094 // FIXME: We need a much better cost-model to take different parameters such
5095 // as register pressure, code size increase and cost of extra branches into
5096 // account. For now we apply a very crude heuristic and only consider loops
5097 // with vectorization factors larger than a certain value.
5098
5099 // Allow the target to opt out entirely.
5101 return false;
5102
5103 // We also consider epilogue vectorization unprofitable for targets that don't
5104 // consider interleaving beneficial (e.g., MVE).
5105 if (TTI.getMaxInterleaveFactor(VF) <= 1)
5106 return false;
5107
5108 unsigned Multiplier = 1;
5109 if (VF.isScalable())
5110 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5111 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5112 return true;
5113 return false;
5114}
5115
5117 const ElementCount MainLoopVF, unsigned IC) {
5120 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5121 return Result;
5122 }
5123
5124 if (!CM.isScalarEpilogueAllowed()) {
5125 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5126 "epilogue is allowed.\n");
5127 return Result;
5128 }
5129
5130 // Not really a cost consideration, but check for unsupported cases here to
5131 // simplify the logic.
5132 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5133 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5134 "is not a supported candidate.\n");
5135 return Result;
5136 }
5137
5139 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5141 if (hasPlanWithVF(ForcedEC))
5142 return {ForcedEC, 0, 0};
5143 else {
5144 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5145 "viable.\n");
5146 return Result;
5147 }
5148 }
5149
5150 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5151 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5152 LLVM_DEBUG(
5153 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5154 return Result;
5155 }
5156
5157 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5158 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5159 "this loop\n");
5160 return Result;
5161 }
5162
5163 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5164 // the main loop handles 8 lanes per iteration. We could still benefit from
5165 // vectorizing the epilogue loop with VF=4.
5166 ElementCount EstimatedRuntimeVF = MainLoopVF;
5167 if (MainLoopVF.isScalable()) {
5168 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5169 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5170 EstimatedRuntimeVF *= *VScale;
5171 }
5172
5173 ScalarEvolution &SE = *PSE.getSE();
5174 Type *TCType = Legal->getWidestInductionType();
5175 const SCEV *RemainingIterations = nullptr;
5176 for (auto &NextVF : ProfitableVFs) {
5177 // Skip candidate VFs without a corresponding VPlan.
5178 if (!hasPlanWithVF(NextVF.Width))
5179 continue;
5180
5181 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5182 // vectors) or the VF of the main loop (fixed vectors).
5183 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5184 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5185 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5186 continue;
5187
5188 // If NextVF is greater than the number of remaining iterations, the
5189 // epilogue loop would be dead. Skip such factors.
5190 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5191 // TODO: extend to support scalable VFs.
5192 if (!RemainingIterations) {
5193 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5194 RemainingIterations = SE.getURemExpr(
5195 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5196 }
5197 if (SE.isKnownPredicate(
5199 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5200 RemainingIterations))
5201 continue;
5202 }
5203
5204 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5205 Result = NextVF;
5206 }
5207
5208 if (Result != VectorizationFactor::Disabled())
5209 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5210 << Result.Width << "\n");
5211 return Result;
5212}
5213
5214std::pair<unsigned, unsigned>
5216 unsigned MinWidth = -1U;
5217 unsigned MaxWidth = 8;
5219 // For in-loop reductions, no element types are added to ElementTypesInLoop
5220 // if there are no loads/stores in the loop. In this case, check through the
5221 // reduction variables to determine the maximum width.
5222 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5223 // Reset MaxWidth so that we can find the smallest type used by recurrences
5224 // in the loop.
5225 MaxWidth = -1U;
5226 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5227 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5228 // When finding the min width used by the recurrence we need to account
5229 // for casts on the input operands of the recurrence.
5230 MaxWidth = std::min<unsigned>(
5231 MaxWidth, std::min<unsigned>(
5234 }
5235 } else {
5236 for (Type *T : ElementTypesInLoop) {
5237 MinWidth = std::min<unsigned>(
5238 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5239 MaxWidth = std::max<unsigned>(
5240 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5241 }
5242 }
5243 return {MinWidth, MaxWidth};
5244}
5245
5247 ElementTypesInLoop.clear();
5248 // For each block.
5249 for (BasicBlock *BB : TheLoop->blocks()) {
5250 // For each instruction in the loop.
5251 for (Instruction &I : BB->instructionsWithoutDebug()) {
5252 Type *T = I.getType();
5253
5254 // Skip ignored values.
5255 if (ValuesToIgnore.count(&I))
5256 continue;
5257
5258 // Only examine Loads, Stores and PHINodes.
5259 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5260 continue;
5261
5262 // Examine PHI nodes that are reduction variables. Update the type to
5263 // account for the recurrence type.
5264 if (auto *PN = dyn_cast<PHINode>(&I)) {
5265 if (!Legal->isReductionVariable(PN))
5266 continue;
5267 const RecurrenceDescriptor &RdxDesc =
5268 Legal->getReductionVars().find(PN)->second;
5271 RdxDesc.getRecurrenceType(),
5273 continue;
5274 T = RdxDesc.getRecurrenceType();
5275 }
5276
5277 // Examine the stored values.
5278 if (auto *ST = dyn_cast<StoreInst>(&I))
5279 T = ST->getValueOperand()->getType();
5280
5281 assert(T->isSized() &&
5282 "Expected the load/store/recurrence type to be sized");
5283
5284 ElementTypesInLoop.insert(T);
5285 }
5286 }
5287}
5288
5289unsigned
5291 InstructionCost LoopCost) {
5292 // -- The interleave heuristics --
5293 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5294 // There are many micro-architectural considerations that we can't predict
5295 // at this level. For example, frontend pressure (on decode or fetch) due to
5296 // code size, or the number and capabilities of the execution ports.
5297 //
5298 // We use the following heuristics to select the interleave count:
5299 // 1. If the code has reductions, then we interleave to break the cross
5300 // iteration dependency.
5301 // 2. If the loop is really small, then we interleave to reduce the loop
5302 // overhead.
5303 // 3. We don't interleave if we think that we will spill registers to memory
5304 // due to the increased register pressure.
5305
5307 return 1;
5308
5309 // Do not interleave if EVL is preferred and no User IC is specified.
5310 if (foldTailWithEVL()) {
5311 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
5312 "Unroll factor forced to be 1.\n");
5313 return 1;
5314 }
5315
5316 // The maximum safe dependence distance already bounds the width, so do not interleave.
5318 return 1;
5319
5320 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5321 const bool HasReductions = !Legal->getReductionVars().empty();
5322
5323 // If we did not calculate the cost for VF (because the user selected the VF)
5324 // then we calculate the cost of VF here.
5325 if (LoopCost == 0) {
5326 LoopCost = expectedCost(VF).first;
5327 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5328
5329 // Loop body is free and there is no need for interleaving.
5330 if (LoopCost == 0)
5331 return 1;
5332 }
5333
5335 // We divide by these constants so assume that we have at least one
5336 // instruction that uses at least one register.
5337 for (auto& pair : R.MaxLocalUsers) {
5338 pair.second = std::max(pair.second, 1U);
5339 }
5340
5341 // We calculate the interleave count using the following formula.
5342 // Subtract the number of loop invariants from the number of available
5343 // registers. These registers are used by all of the interleaved instances.
5344 // Next, divide the remaining registers by the number of registers that is
5345 // required by the loop, in order to estimate how many parallel instances
5346 // fit without causing spills. All of this is rounded down if necessary to be
5347 // a power of two. We want a power-of-two interleave count to simplify any
5348 // addressing operations or alignment considerations.
5349 // We also want power-of-two interleave counts to ensure that the induction
5350 // variable of the vector loop wraps to zero when the tail is folded by
5351 // masking; this currently happens under OptForSize, in which case IC is set to 1 above.
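// Illustrative arithmetic (register counts assumed, not from a real target):
// with 32 registers in a class, 2 of them tied up by loop-invariant values and
// a maximum of 10 live values inside the loop body, the loop below yields
// bit_floor((32 - 2) / 10) = bit_floor(3) = 2 as the candidate interleave
// count for that class.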
5352 unsigned IC = UINT_MAX;
5353
5354 for (auto& pair : R.MaxLocalUsers) {
5355 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5356 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5357 << " registers of "
5358 << TTI.getRegisterClassName(pair.first) << " register class\n");
5359 if (VF.isScalar()) {
5360 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5361 TargetNumRegisters = ForceTargetNumScalarRegs;
5362 } else {
5363 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5364 TargetNumRegisters = ForceTargetNumVectorRegs;
5365 }
5366 unsigned MaxLocalUsers = pair.second;
5367 unsigned LoopInvariantRegs = 0;
5368 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5369 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5370
5371 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5372 MaxLocalUsers);
5373 // Don't count the induction variable as interleaved.
5375 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5376 std::max(1U, (MaxLocalUsers - 1)));
5377 }
5378
5379 IC = std::min(IC, TmpIC);
5380 }
5381
5382 // Clamp the interleave ranges to reasonable counts.
5383 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5384
5385 // Check if the user has overridden the max.
5386 if (VF.isScalar()) {
5387 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5388 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5389 } else {
5390 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5391 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5392 }
5393
5394 unsigned EstimatedVF = VF.getKnownMinValue();
5395 if (VF.isScalable()) {
5396 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5397 EstimatedVF *= *VScale;
5398 }
5399 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5400
5401 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5402 if (KnownTC > 0) {
5403 // At least one iteration must be scalar when this constraint holds. So the
5404 // maximum available iterations for interleaving is one less.
5405 unsigned AvailableTC =
5406 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5407
5408 // If the trip count is known, we select between two prospective ICs, where
5409 // 1) the aggressive IC is capped by the trip count divided by VF
5410 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5411 // The final IC is selected in a way that the epilogue loop trip count is
5412 // minimized while maximizing the IC itself, so that we either run the
5413 // vector loop at least once if it generates a small epilogue loop, or else
5414 // we run the vector loop at least twice.
5415
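// For example (values assumed): with AvailableTC = 64, EstimatedVF = 8 and a
// target maximum of 8, the aggressive bound is bit_floor(min(64 / 8, 8)) = 8
// and the conservative bound is bit_floor(min(64 / 16, 8)) = 4; since both
// leave the same (empty) scalar tail (64 % 64 == 64 % 32 == 0), the larger
// count of 8 is kept by the code below.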
5416 unsigned InterleaveCountUB = bit_floor(
5417 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5418 unsigned InterleaveCountLB = bit_floor(std::max(
5419 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5420 MaxInterleaveCount = InterleaveCountLB;
5421
5422 if (InterleaveCountUB != InterleaveCountLB) {
5423 unsigned TailTripCountUB =
5424 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5425 unsigned TailTripCountLB =
5426 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5427 // If both produce the same scalar tail, maximize the IC to do the same work
5428 // in fewer vector loop iterations.
5429 if (TailTripCountUB == TailTripCountLB)
5430 MaxInterleaveCount = InterleaveCountUB;
5431 }
5432 } else if (BestKnownTC && *BestKnownTC > 0) {
5433 // At least one iteration must be scalar when this constraint holds. So the
5434 // maximum available iterations for interleaving is one less.
5435 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5436 ? (*BestKnownTC) - 1
5437 : *BestKnownTC;
5438
5439 // If the trip count is only an estimated compile-time constant, cap the IC
5440 // at the trip count divided by VF * 2, such that the vector loop runs at
5441 // least twice to make interleaving seem profitable when there is an epilogue
5442 // loop present. Since the exact trip count is not known, we choose to be
5443 // conservative in our IC estimate.
5444 MaxInterleaveCount = bit_floor(std::max(
5445 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5446 }
5447
5448 assert(MaxInterleaveCount > 0 &&
5449 "Maximum interleave count must be greater than 0");
5450
5451 // Clamp the calculated IC to be between 1 and the max interleave count that
5452 // the target and trip count allow.
5453 if (IC > MaxInterleaveCount)
5454 IC = MaxInterleaveCount;
5455 else
5456 // Make sure IC is greater than 0.
5457 IC = std::max(1u, IC);
5458
5459 assert(IC > 0 && "Interleave count must be greater than 0.");
5460
5461 // Interleave if we vectorized this loop and there is a reduction that could
5462 // benefit from interleaving.
5463 if (VF.isVector() && HasReductions) {
5464 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5465 return IC;
5466 }
5467
5468 // For any scalar loop that either requires runtime checks or predication we
5469 // are better off leaving this to the unroller. Note that if we've already
5470 // vectorized the loop we will have done the runtime check and so interleaving
5471 // won't require further checks.
5472 bool ScalarInterleavingRequiresPredication =
5473 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5474 return Legal->blockNeedsPredication(BB);
5475 }));
5476 bool ScalarInterleavingRequiresRuntimePointerCheck =
5478
5479 // We want to interleave small loops in order to reduce the loop overhead and
5480 // potentially expose ILP opportunities.
5481 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5482 << "LV: IC is " << IC << '\n'
5483 << "LV: VF is " << VF << '\n');
5484 const bool AggressivelyInterleaveReductions =
5485 TTI.enableAggressiveInterleaving(HasReductions);
5486 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5487 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5488 // We assume that the cost overhead is 1 and we use the cost model
5489 // to estimate the cost of the loop and interleave until the cost of the
5490 // loop overhead is about 5% of the cost of the loop.
5491 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5492 SmallLoopCost / *LoopCost.getValue()));
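// For instance, assuming SmallLoopCost is at its default of 20 and the loop
// body cost is 5 (both values illustrative), SmallIC = min(IC, bit_floor(20/5))
// = min(IC, 4): interleaving by 4 brings the combined body cost to about 20,
// at which point the assumed per-iteration overhead of 1 is roughly 5% of it.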
5493
5494 // Interleave until store/load ports (estimated by max interleave count) are
5495 // saturated.
5496 unsigned NumStores = Legal->getNumStores();
5497 unsigned NumLoads = Legal->getNumLoads();
5498 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5499 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5500
5501 // There is little point in interleaving for reductions containing selects
5502 // and compares when VF=1 since it may just create more overhead than it's
5503 // worth for loops with small trip counts. This is because we still have to
5504 // do the final reduction after the loop.
5505 bool HasSelectCmpReductions =
5506 HasReductions &&
5507 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5508 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5509 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5510 RdxDesc.getRecurrenceKind());
5511 });
5512 if (HasSelectCmpReductions) {
5513 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5514 return 1;
5515 }
5516
5517 // If we have a scalar reduction (vector reductions are already dealt with
5518 // by this point), we can increase the critical path length if the loop
5519 // we're interleaving is inside another loop. For tree-wise reductions
5520 // set the limit to 2, and for ordered reductions it's best to disable
5521 // interleaving entirely.
5522 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5523 bool HasOrderedReductions =
5524 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5525 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5526 return RdxDesc.isOrdered();
5527 });
5528 if (HasOrderedReductions) {
5529 LLVM_DEBUG(
5530 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5531 return 1;
5532 }
5533
5534 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5535 SmallIC = std::min(SmallIC, F);
5536 StoresIC = std::min(StoresIC, F);
5537 LoadsIC = std::min(LoadsIC, F);
5538 }
5539
5541 std::max(StoresIC, LoadsIC) > SmallIC) {
5542 LLVM_DEBUG(
5543 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5544 return std::max(StoresIC, LoadsIC);
5545 }
5546
5547 // If there are scalar reductions and TTI has enabled aggressive
5548 // interleaving for reductions, we will interleave to expose ILP.
5549 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5550 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5551 // Interleave no less than SmallIC but not as aggressive as the normal IC
5552 // to satisfy the rare situation when resources are too limited.
5553 return std::max(IC / 2, SmallIC);
5554 } else {
5555 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5556 return SmallIC;
5557 }
5558 }
5559
5560 // Interleave if this is a large loop (small loops are already dealt with by
5561 // this point) that could benefit from interleaving.
5562 if (AggressivelyInterleaveReductions) {
5563 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5564 return IC;
5565 }
5566
5567 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5568 return 1;
5569}
5570
5573 // This function calculates the register usage by measuring the highest number
5574 // of values that are alive at a single location. Obviously, this is a very
5575 // rough estimation. We scan the loop in topological order and
5576 // assign a number to each instruction. We use RPO to ensure that defs are
5577 // met before their users. We assume that each instruction that has in-loop
5578 // users starts an interval. We record every time that an in-loop value is
5579 // used, so we have a list of the first and last occurrences of each
5580 // instruction. Next, we transpose this data structure into a multi map that
5581 // holds the list of intervals that *end* at a specific location. This multi
5582 // map allows us to perform a linear search. We scan the instructions linearly
5583 // and record each time that a new interval starts, by placing it in a set.
5584 // If we find this value in the multi-map then we remove it from the set.
5585 // The max register usage is the maximum size of the set.
5586 // We also search for instructions that are defined outside the loop, but are
5587 // used inside the loop. We need this number separately from the max-interval
5588 // usage number because when we unroll, loop-invariant values do not take
5589 // more registers.
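// As a sketch of the interval idea (instruction indices assumed): if %a is
// defined at index 1 and last used at index 4, while %b is defined at index 2
// and last used at index 3, then at index 3 both intervals are still open and
// the register usage estimate at that point is 2.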
5591 DFS.perform(LI);
5592
5593 RegisterUsage RU;
5594
5595 // Each 'key' in the map opens a new interval. The values
5596 // of the map are the index of the 'last seen' usage of the
5597 // instruction that is the key.
5599
5600 // Maps instruction to its index.
5602 // Marks the end of each interval.
5603 IntervalMap EndPoint;
5604 // Saves the list of instruction indices that are used in the loop.
5606 // Saves the list of values that are used in the loop but are defined outside
5607 // the loop (not including non-instruction values such as arguments and
5608 // constants).
5609 SmallSetVector<Instruction *, 8> LoopInvariants;
5610
5611 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5612 for (Instruction &I : BB->instructionsWithoutDebug()) {
5613 IdxToInstr.push_back(&I);
5614
5615 // Save the end location of each USE.
5616 for (Value *U : I.operands()) {
5617 auto *Instr = dyn_cast<Instruction>(U);
5618
5619 // Ignore non-instruction values such as arguments, constants, etc.
5620 // FIXME: Might need some motivation why these values are ignored. If,
5621 // for example, an argument is used inside the loop, it will increase the
5622 // register pressure (so shouldn't we add it to LoopInvariants?).
5623 if (!Instr)
5624 continue;
5625
5626 // If this instruction is outside the loop then record it and continue.
5627 if (!TheLoop->contains(Instr)) {
5628 LoopInvariants.insert(Instr);
5629 continue;
5630 }
5631
5632 // Overwrite previous end points.
5633 EndPoint[Instr] = IdxToInstr.size();
5634 Ends.insert(Instr);
5635 }
5636 }
5637 }
5638
5639 // Saves the list of intervals that end with the index in 'key'.
5640 using InstrList = SmallVector<Instruction *, 2>;
5641 DenseMap<unsigned, InstrList> TransposeEnds;
5642
5643 // Transpose the EndPoints to a list of values that end at each index.
5644 for (auto &Interval : EndPoint)
5645 TransposeEnds[Interval.second].push_back(Interval.first);
5646
5647 SmallPtrSet<Instruction *, 8> OpenIntervals;
5650
5651 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5652
5653 const auto &TTICapture = TTI;
5654 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5655 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5656 return 0;
5657 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5658 };
5659
5660 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5661 Instruction *I = IdxToInstr[i];
5662
5663 // Remove all of the instructions that end at this location.
5664 InstrList &List = TransposeEnds[i];
5665 for (Instruction *ToRemove : List)
5666 OpenIntervals.erase(ToRemove);
5667
5668 // Ignore instructions that are never used within the loop.
5669 if (!Ends.count(I))
5670 continue;
5671
5672 // Skip ignored values.
5673 if (ValuesToIgnore.count(I))
5674 continue;
5675
5677
5678 // For each VF find the maximum usage of registers.
5679 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5680 // Count the number of registers used, per register class, given all open
5681 // intervals.
5682 // Note that elements in this SmallMapVector will be default constructed
5683 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5684 // there is no previous entry for ClassID.
5686
5687 if (VFs[j].isScalar()) {
5688 for (auto *Inst : OpenIntervals) {
5689 unsigned ClassID =
5690 TTI.getRegisterClassForType(false, Inst->getType());
5691 // FIXME: The target might use more than one register for the type
5692 // even in the scalar case.
5693 RegUsage[ClassID] += 1;
5694 }
5695 } else {
5697 for (auto *Inst : OpenIntervals) {
5698 // Skip ignored values for VF > 1.
5699 if (VecValuesToIgnore.count(Inst))
5700 continue;
5701 if (isScalarAfterVectorization(Inst, VFs[j])) {
5702 unsigned ClassID =
5703 TTI.getRegisterClassForType(false, Inst->getType());
5704 // FIXME: The target might use more than one register for the type
5705 // even in the scalar case.
5706 RegUsage[ClassID] += 1;
5707 } else {
5708 unsigned ClassID =
5709 TTI.getRegisterClassForType(true, Inst->getType());
5710 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5711 }
5712 }
5713 }
5714
5715 for (auto& pair : RegUsage) {
5716 auto &Entry = MaxUsages[j][pair.first];
5717 Entry = std::max(Entry, pair.second);
5718 }
5719 }
5720
5721 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5722 << OpenIntervals.size() << '\n');
5723
5724 // Add the current instruction to the list of open intervals.
5725 OpenIntervals.insert(I);
5726 }
5727
5728 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5729 // Note that elements in this SmallMapVector will be default constructed
5730 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5731 // there is no previous entry for ClassID.
5733
5734 for (auto *Inst : LoopInvariants) {
5735 // FIXME: The target might use more than one register for the type
5736 // even in the scalar case.
5737 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5738 auto *I = cast<Instruction>(U);
5739 return TheLoop != LI->getLoopFor(I->getParent()) ||
5740 isScalarAfterVectorization(I, VFs[i]);
5741 });
5742
5743 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5744 unsigned ClassID =
5745 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5746 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5747 }
5748
5749 LLVM_DEBUG({
5750 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5751 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5752 << " item\n";
5753 for (const auto &pair : MaxUsages[i]) {
5754 dbgs() << "LV(REG): RegisterClass: "
5755 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5756 << " registers\n";
5757 }
5758 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5759 << " item\n";
5760 for (const auto &pair : Invariant) {
5761 dbgs() << "LV(REG): RegisterClass: "
5762 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5763 << " registers\n";
5764 }
5765 });
5766
5767 RU.LoopInvariantRegs = Invariant;
5768 RU.MaxLocalUsers = MaxUsages[i];
5769 RUs[i] = RU;
5770 }
5771
5772 return RUs;
5773}
5774
5775bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5776 ElementCount VF) {
5777 // TODO: Cost model for emulated masked load/store is completely
5778 // broken. This hack guides the cost model to use an artificially
5779 // high enough value to practically disable vectorization with such
5780 // operations, except where previously deployed legality hack allowed
5781 // using very low cost values. This is to avoid regressions coming simply
5782 // from moving the "masked load/store" check from legality to the cost model.
5783 // Masked Load/Gather emulation was previously never allowed.
5784 // A limited amount of Masked Store/Scatter emulation was allowed.
5786 "Expecting a scalar emulated instruction");
5787 return isa<LoadInst>(I) ||
5788 (isa<StoreInst>(I) &&
5789 NumPredStores > NumberOfStoresToPredicate);
5790}
5791
5793 // If we aren't vectorizing the loop, or if we've already collected the
5794 // instructions to scalarize, there's nothing to do. Collection may already
5795 // have occurred if we have a user-selected VF and are now computing the
5796 // expected cost for interleaving.
5797 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5798 return;
5799
5800 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5801 // not profitable to scalarize any instructions, the presence of VF in the
5802 // map will indicate that we've analyzed it already.
5803 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5804
5805 PredicatedBBsAfterVectorization[VF].clear();
5806
5807 // Find all the instructions that are scalar with predication in the loop and
5808 // determine if it would be better to not if-convert the blocks they are in.
5809 // If so, we also record the instructions to scalarize.
5810 for (BasicBlock *BB : TheLoop->blocks()) {
5812 continue;
5813 for (Instruction &I : *BB)
5814 if (isScalarWithPredication(&I, VF)) {
5815 ScalarCostsTy ScalarCosts;
5816 // Do not apply discount if scalable, because that would lead to
5817 // invalid scalarization costs.
5818 // Do not apply discount logic if hacked cost is needed
5819 // for emulated masked memrefs.
5820 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5821 !useEmulatedMaskMemRefHack(&I, VF) &&
5822 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5823 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5824 // Remember that BB will remain after vectorization.
5825 PredicatedBBsAfterVectorization[VF].insert(BB);
5826 }
5827 }
5828}
5829
5830InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5831 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5832 assert(!isUniformAfterVectorization(PredInst, VF) &&
5833 "Instruction marked uniform-after-vectorization will be predicated");
5834
5835 // Initialize the discount to zero, meaning that the scalar version and the
5836 // vector version cost the same.
5837 InstructionCost Discount = 0;
5838
5839 // Holds instructions to analyze. The instructions we visit are mapped in
5840 // ScalarCosts. Those instructions are the ones that would be scalarized if
5841 // we find that the scalar version costs less.
5843
5844 // Returns true if the given instruction can be scalarized.
5845 auto canBeScalarized = [&](Instruction *I) -> bool {
5846 // We only attempt to scalarize instructions forming a single-use chain
5847 // from the original predicated block that would otherwise be vectorized.
5848 // Although not strictly necessary, we give up on instructions we know will
5849 // already be scalar to avoid traversing chains that are unlikely to be
5850 // beneficial.
5851 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5853 return false;
5854
5855 // If the instruction is scalar with predication, it will be analyzed
5856 // separately. We ignore it within the context of PredInst.
5857 if (isScalarWithPredication(I, VF))
5858 return false;
5859
5860 // If any of the instruction's operands are uniform after vectorization,
5861 // the instruction cannot be scalarized. This prevents, for example, a
5862 // masked load from being scalarized.
5863 //
5864 // We assume we will only emit a value for lane zero of an instruction
5865 // marked uniform after vectorization, rather than VF identical values.
5866 // Thus, if we scalarize an instruction that uses a uniform, we would
5867 // create uses of values corresponding to the lanes we aren't emitting code
5868 // for. This behavior can be changed by allowing getScalarValue to clone
5869 // the lane zero values for uniforms rather than asserting.
5870 for (Use &U : I->operands())
5871 if (auto *J = dyn_cast<Instruction>(U.get()))
5872 if (isUniformAfterVectorization(J, VF))
5873 return false;
5874
5875 // Otherwise, we can scalarize the instruction.
5876 return true;
5877 };
5878
5879 // Compute the expected cost discount from scalarizing the entire expression
5880 // feeding the predicated instruction. We currently only consider expressions
5881 // that are single-use instruction chains.
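// A hedged worked example (costs made up): if the chain's vectorized cost sums
// to 12 while its scalarized cost, after the block-probability scaling below,
// sums to 8, the accumulated Discount is 12 - 8 = 4; since that is
// non-negative, the caller in collectInstsToScalarize keeps the chain scalar.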
5882 Worklist.push_back(PredInst);
5883 while (!Worklist.empty()) {
5884 Instruction *I = Worklist.pop_back_val();
5885
5886 // If we've already analyzed the instruction, there's nothing to do.
5887 if (ScalarCosts.contains(I))
5888 continue;
5889
5890 // Compute the cost of the vector instruction. Note that this cost already
5891 // includes the scalarization overhead of the predicated instruction.
5892 InstructionCost VectorCost = getInstructionCost(I, VF).first;
5893
5894 // Compute the cost of the scalarized instruction. This cost is the cost of
5895 // the instruction as if it wasn't if-converted and instead remained in the
5896 // predicated block. We will scale this cost by block probability after
5897 // computing the scalarization overhead.
5898 InstructionCost ScalarCost =
5899 VF.getFixedValue() *
5900 getInstructionCost(I, ElementCount::getFixed(1)).first;
5901
5902 // Compute the scalarization overhead of needed insertelement instructions
5903 // and phi nodes.
5905 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5906 ScalarCost += TTI.getScalarizationOverhead(
5907 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5908 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5909 /*Extract*/ false, CostKind);
5910 ScalarCost +=
5911 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5912 }
5913
5914 // Compute the scalarization overhead of needed extractelement
5915 // instructions. For each of the instruction's operands, if the operand can
5916 // be scalarized, add it to the worklist; otherwise, account for the
5917 // overhead.
5918 for (Use &U : I->operands())
5919 if (auto *J = dyn_cast<Instruction>(U.get())) {
5920 assert(VectorType::isValidElementType(J->getType()) &&
5921 "Instruction has non-scalar type");
5922 if (canBeScalarized(J))
5923 Worklist.push_back(J);
5924 else if (needsExtract(J, VF)) {
5925 ScalarCost += TTI.getScalarizationOverhead(
5926 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5927 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5928 /*Extract*/ true, CostKind);
5929 }
5930 }
5931
5932 // Scale the total scalar cost by block probability.
5933 ScalarCost /= getReciprocalPredBlockProb();
5934
5935 // Compute the discount. A non-negative discount means the vector version
5936 // of the instruction costs more, and scalarizing would be beneficial.
5937 Discount += VectorCost - ScalarCost;
5938 ScalarCosts[I] = ScalarCost;
5939 }
5940
5941 return Discount;
5942}
5943
5948
5949 // For each block.
5950 for (BasicBlock *BB : TheLoop->blocks()) {
5951 VectorizationCostTy BlockCost;
5952
5953 // For each instruction in the old loop.
5954 for (Instruction &I : BB->instructionsWithoutDebug()) {
5955 // Skip ignored values.
5956 if (ValuesToIgnore.count(&I) ||
5957 (VF.isVector() && VecValuesToIgnore.count(&I)))
5958 continue;
5959
5960 VectorizationCostTy C = getInstructionCost(&I, VF);
5961
5962 // Check if we should override the cost.
5963 if (C.first.isValid() &&
5964 ForceTargetInstructionCost.getNumOccurrences() > 0)
5966
5967 // Keep a list of instructions with invalid costs.
5968 if (Invalid && !C.first.isValid())
5969 Invalid->emplace_back(&I, VF);
5970
5971 BlockCost.first += C.first;
5972 BlockCost.second |= C.second;
5973 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5974 << " for VF " << VF << " For instruction: " << I
5975 << '\n');
5976 }
5977
5978 // If we are vectorizing a predicated block, it will have been
5979 // if-converted. This means that the block's instructions (aside from
5980 // stores and instructions that may divide by zero) will now be
5981 // unconditionally executed. For the scalar case, we may not always execute
5982 // the predicated block, if it is an if-else block. Thus, scale the block's
5983 // cost by the probability of executing it. blockNeedsPredication from
5984 // Legal is used so as to not include all blocks in tail folded loops.
5985 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5986 BlockCost.first /= getReciprocalPredBlockProb();
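// E.g., assuming a reciprocal predicated-block probability of 2 (a 50% chance
// of executing the block), a scalar block cost of 10 contributes 5 to the
// overall loop cost.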
5987
5988 Cost.first += BlockCost.first;
5989 Cost.second |= BlockCost.second;
5990 }
5991
5992 return Cost;
5993}
5994
5995/// Gets Address Access SCEV after verifying that the access pattern
5996/// is loop invariant except the induction variable dependence.
5997///
5998/// This SCEV can be sent to the Target in order to estimate the address
5999/// calculation cost.
6001 Value *Ptr,
6004 const Loop *TheLoop) {
6005
6006 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6007 if (!Gep)
6008 return nullptr;
6009
6010 // We are looking for a gep with all loop invariant indices except for one
6011 // which should be an induction variable.
6012 auto SE = PSE.getSE();
6013 unsigned NumOperands = Gep->getNumOperands();
6014 for (unsigned i = 1; i < NumOperands; ++i) {
6015 Value *Opd = Gep->getOperand(i);
6016 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6017 !Legal->isInductionVariable(Opd))
6018 return nullptr;
6019 }
6020
6021 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6022 return PSE.getSCEV(Ptr);
6023}
6024
6026LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6027 ElementCount VF) {
6028 assert(VF.isVector() &&
6029 "Scalarization cost of instruction implies vectorization.");
6030 if (VF.isScalable())
6032
6033 Type *ValTy = getLoadStoreType(I);
6034 auto SE = PSE.getSE();
6035
6036 unsigned AS = getLoadStoreAddressSpace(I);
6038 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6039 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6040 // that it is being called from this specific place.
6041
6042 // Figure out whether the access is strided and get the stride value
6043 // if it's known at compile time.
6044 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6045
6046 // Get the cost of the scalar memory instruction and address computation.
6048 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6049
6050 // Don't pass *I here, since it is scalar but will actually be part of a
6051 // vectorized loop where the user of it is a vectorized instruction.
6053 const Align Alignment = getLoadStoreAlignment(I);
6054 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6055 ValTy->getScalarType(),
6056 Alignment, AS, CostKind);
6057
6058 // Get the overhead of the extractelement and insertelement instructions
6059 // we might create due to scalarization.
6060 Cost += getScalarizationOverhead(I, VF, CostKind);
6061
6062 // If we have a predicated load/store, it will need extra i1 extracts and
6063 // conditional branches, but may not be executed for each vector lane. Scale
6064 // the cost by the probability of executing the predicated block.
6065 if (isPredicatedInst(I)) {
6066 Cost /= getReciprocalPredBlockProb();
6067
6068 // Add the cost of an i1 extract and a branch
6069 auto *Vec_i1Ty =
6070 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6071 Cost += TTI.getScalarizationOverhead(
6072 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6073 /*Insert=*/false, /*Extract=*/true, CostKind);
6074 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6075
6076 if (useEmulatedMaskMemRefHack(I, VF))
6077 // Artificially setting to a high enough value to practically disable
6078 // vectorization with such operations.
6079 Cost = 3000000;
6080 }
6081
6082 return Cost;
6083}
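A standalone sketch of how the scalarized memory-op cost above is assembled for a fixed VF: per-lane address and memory-op costs, pack/unpack overhead, and an extra predication term when the access is masked. The component costs are made-up numbers standing in for the TTI queries; only the shape of the computation is taken from the code above.

#include <cstdint>

uint64_t memInstScalarizationCost(unsigned VF, bool Predicated) {
  const uint64_t AddrCost = 1, MemOpCost = 4, InsertExtractCost = 2;
  uint64_t Cost = VF * AddrCost;      // one address computation per scalar lane
  Cost += VF * MemOpCost;             // one scalar load/store per lane
  Cost += VF * InsertExtractCost;     // pack/unpack vector lanes
  if (Predicated) {
    Cost /= 2;                        // executes with ~50% probability
    Cost += VF * (1 /*i1 extract*/ + 1 /*branch*/);
  }
  return Cost;
}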
6084
6085 InstructionCost
6086 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6087 ElementCount VF) {
6088 Type *ValTy = getLoadStoreType(I);
6089 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6090 Value *Ptr = getLoadStorePointerOperand(I);
6091 unsigned AS = getLoadStoreAddressSpace(I);
6092 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6093 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6094
6095 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6096 "Stride should be 1 or -1 for consecutive memory access");
6097 const Align Alignment = getLoadStoreAlignment(I);
6098 InstructionCost Cost = 0;
6099 if (Legal->isMaskRequired(I)) {
6100 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6101 CostKind);
6102 } else {
6103 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6104 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6105 CostKind, OpInfo, I);
6106 }
6107
6108 bool Reverse = ConsecutiveStride < 0;
6109 if (Reverse)
6110 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6111 std::nullopt, CostKind, 0);
6112 return Cost;
6113}
6114
6115 InstructionCost
6116 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6117 ElementCount VF) {
6118 assert(Legal->isUniformMemOp(*I, VF));
6119
6120 Type *ValTy = getLoadStoreType(I);
6121 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6122 const Align Alignment = getLoadStoreAlignment(I);
6123 unsigned AS = getLoadStoreAddressSpace(I);
6124 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6125 if (isa<LoadInst>(I)) {
6126 return TTI.getAddressComputationCost(ValTy) +
6127 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6128 CostKind) +
6129 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6130 }
6131 StoreInst *SI = cast<StoreInst>(I);
6132
6133 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6134 return TTI.getAddressComputationCost(ValTy) +
6135 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6136 CostKind) +
6137 (isLoopInvariantStoreValue
6138 ? 0
6139 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6140 CostKind, VF.getKnownMinValue() - 1));
6141}
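A standalone sketch of the uniform memory-op costing above: a uniform load is a scalar load plus a broadcast, and a uniform store is a scalar store plus (when the stored value is not loop-invariant) an extract of the last vector lane. Per-component costs are placeholders for the TTI queries.

#include <cstdint>

uint64_t uniformMemOpCost(bool IsLoad, bool StoreValueInvariant) {
  const uint64_t AddrCost = 1, MemOpCost = 4, BroadcastCost = 1, ExtractCost = 1;
  if (IsLoad)
    return AddrCost + MemOpCost + BroadcastCost;      // load + splat
  return AddrCost + MemOpCost +                       // store +
         (StoreValueInvariant ? 0 : ExtractCost);     // maybe extract last lane
}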
6142
6143 InstructionCost
6144 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6145 ElementCount VF) {
6146 Type *ValTy = getLoadStoreType(I);
6147 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6148 const Align Alignment = getLoadStoreAlignment(I);
6149 const Value *Ptr = getLoadStorePointerOperand(I);
6150
6151 return TTI.getAddressComputationCost(VectorTy) +
6152 TTI.getGatherScatterOpCost(
6153 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6154 TargetTransformInfo::TCK_RecipThroughput, I);
6155}
6156
6157 InstructionCost
6158 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6159 ElementCount VF) {
6160 Type *ValTy = getLoadStoreType(I);
6161 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6162 unsigned AS = getLoadStoreAddressSpace(I);
6163 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6164
6165 auto Group = getInterleavedAccessGroup(I);
6166 assert(Group && "Fail to get an interleaved access group.");
6167
6168 unsigned InterleaveFactor = Group->getFactor();
6169 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6170
6171 // Holds the indices of existing members in the interleaved group.
6172 SmallVector<unsigned, 4> Indices;
6173 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6174 if (Group->getMember(IF))
6175 Indices.push_back(IF);
6176
6177 // Calculate the cost of the whole interleaved group.
6178 bool UseMaskForGaps =
6179 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6180 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6181 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6182 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6183 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6184
6185 if (Group->isReverse()) {
6186 // TODO: Add support for reversed masked interleaved access.
6187 assert(!Legal->isMaskRequired(I) &&
6188 "Reverse masked interleaved access not supported.");
6189 Cost += Group->getNumMembers() *
6190 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6191 std::nullopt, CostKind, 0);
6192 }
6193 return Cost;
6194}
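A standalone sketch of the group bookkeeping above: collect the indices of the members that are actually present and decide whether gaps need masking. `Member[i]` is a hypothetical stand-in for InterleaveGroup::getMember(i) being non-null.

#include <vector>

std::vector<unsigned> presentIndices(const std::vector<bool> &Member) {
  std::vector<unsigned> Indices;
  for (unsigned IF = 0; IF < Member.size(); ++IF)
    if (Member[IF])
      Indices.push_back(IF); // only existing members contribute
  return Indices;
}

bool useMaskForGaps(bool IsStore, unsigned NumMembers, unsigned Factor,
                    bool RequiresScalarEpilogue, bool ScalarEpilogueAllowed) {
  return (RequiresScalarEpilogue && !ScalarEpilogueAllowed) ||
         (IsStore && NumMembers < Factor);
}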
6195
6196std::optional<InstructionCost>
6197LoopVectorizationCostModel::getReductionPatternCost(
6198 Instruction *I, ElementCount VF, Type *Ty,
6199 TTI::TargetCostKind CostKind) {
6200 using namespace llvm::PatternMatch;
6201 // Early exit for no inloop reductions
6202 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6203 return std::nullopt;
6204 auto *VectorTy = cast<VectorType>(Ty);
6205
6206 // We are looking for a pattern of, and finding the minimal acceptable cost:
6207 // reduce(mul(ext(A), ext(B))) or
6208 // reduce(mul(A, B)) or
6209 // reduce(ext(A)) or
6210 // reduce(A).
6211 // The basic idea is that we walk down the tree to do that, finding the root
6212 // reduction instruction in InLoopReductionImmediateChains. From there we find
6213 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6214 // of the components. If the reduction cost is lower, then we return it for the
6215 // reduction instruction and 0 for the other instructions in the pattern. If
6216 // it is not, we return an invalid cost specifying the original cost method
6217 // should be used.
6218 Instruction *RetI = I;
6219 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6220 if (!RetI->hasOneUser())
6221 return std::nullopt;
6222 RetI = RetI->user_back();
6223 }
6224
6225 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6226 RetI->user_back()->getOpcode() == Instruction::Add) {
6227 RetI = RetI->user_back();
6228 }
6229
6230 // Test if the found instruction is a reduction, and if not return an invalid
6231 // cost specifying the parent to use the original cost modelling.
6232 if (!InLoopReductionImmediateChains.count(RetI))
6233 return std::nullopt;
6234
6235 // Find the reduction this chain is a part of and calculate the basic cost of
6236 // the reduction on its own.
6237 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6238 Instruction *ReductionPhi = LastChain;
6239 while (!isa<PHINode>(ReductionPhi))
6240 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6241
6242 const RecurrenceDescriptor &RdxDesc =
6243 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6244
6246 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6247
6248 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6249 // normal fmul instruction to the cost of the fadd reduction.
6250 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6251 BaseCost +=
6252 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6253
6254 // If we're using ordered reductions then we can just return the base cost
6255 // here, since getArithmeticReductionCost calculates the full ordered
6256 // reduction cost when FP reassociation is not allowed.
6257 if (useOrderedReductions(RdxDesc))
6258 return BaseCost;
6259
6260 // Get the operand that was not the reduction chain and match it to one of the
6261 // patterns, returning the better cost if it is found.
6262 Instruction *RedOp = RetI->getOperand(1) == LastChain
6263 ? dyn_cast<Instruction>(RetI->getOperand(0))
6264 : dyn_cast<Instruction>(RetI->getOperand(1));
6265
6266 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6267
6268 Instruction *Op0, *Op1;
6269 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6270 match(RedOp,
6271 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6272 match(Op0, m_ZExtOrSExt(m_Value())) &&
6273 Op0->getOpcode() == Op1->getOpcode() &&
6274 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6275 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6276 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6277
6278 // Matched reduce.add(ext(mul(ext(A), ext(B)))
6279 // Note that the extend opcodes need to all match, or if A==B they will have
6280 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6281 // which is equally fine.
6282 bool IsUnsigned = isa<ZExtInst>(Op0);
6283 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6284 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6285
6286 InstructionCost ExtCost =
6287 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6288 TTI::CastContextHint::None, CostKind);
6289 InstructionCost MulCost =
6290 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6291 InstructionCost Ext2Cost =
6292 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6293 TTI::CastContextHint::None, CostKind);
6294
6295 InstructionCost RedCost = TTI.getMulAccReductionCost(
6296 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6297
6298 if (RedCost.isValid() &&
6299 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6300 return I == RetI ? RedCost : 0;
6301 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6302 !TheLoop->isLoopInvariant(RedOp)) {
6303 // Matched reduce(ext(A))
6304 bool IsUnsigned = isa<ZExtInst>(RedOp);
6305 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6306 InstructionCost RedCost = TTI.getExtendedReductionCost(
6307 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6308 RdxDesc.getFastMathFlags(), CostKind);
6309
6310 InstructionCost ExtCost =
6311 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6312 TTI::CastContextHint::None, CostKind);
6313 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6314 return I == RetI ? RedCost : 0;
6315 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6316 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6317 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6318 Op0->getOpcode() == Op1->getOpcode() &&
6319 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6320 bool IsUnsigned = isa<ZExtInst>(Op0);
6321 Type *Op0Ty = Op0->getOperand(0)->getType();
6322 Type *Op1Ty = Op1->getOperand(0)->getType();
6323 Type *LargestOpTy =
6324 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6325 : Op0Ty;
6326 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6327
6328 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6329 // different sizes. We take the largest type as the ext to reduce, and add
6330 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6331 InstructionCost ExtCost0 = TTI.getCastInstrCost(
6332 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6333 TTI::CastContextHint::None, CostKind);
6334 InstructionCost ExtCost1 = TTI.getCastInstrCost(
6335 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6336 TTI::CastContextHint::None, CostKind);
6337 InstructionCost MulCost =
6338 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6339
6340 InstructionCost RedCost = TTI.getMulAccReductionCost(
6341 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6342 InstructionCost ExtraExtCost = 0;
6343 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6344 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6345 ExtraExtCost = TTI.getCastInstrCost(
6346 ExtraExtOp->getOpcode(), ExtType,
6347 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6348 TTI::CastContextHint::None, CostKind);
6349 }
6350
6351 if (RedCost.isValid() &&
6352 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6353 return I == RetI ? RedCost : 0;
6354 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6355 // Matched reduce.add(mul())
6356 InstructionCost MulCost =
6357 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6358
6359 InstructionCost RedCost = TTI.getMulAccReductionCost(
6360 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6361
6362 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6363 return I == RetI ? RedCost : 0;
6364 }
6365 }
6366
6367 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6368}
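A standalone sketch of the comparison made above for the reduce.add(mul(ext(A), ext(B))) pattern: use the fused multiply-accumulate reduction cost only if it beats the sum of the component costs, otherwise fall back to the default per-instruction model. All numbers are placeholders for the corresponding TTI queries.

#include <cstdint>
#include <optional>

std::optional<uint64_t> fusedDotProductCost(uint64_t RedCost, uint64_t ExtCost,
                                            uint64_t MulCost,
                                            uint64_t BaseCost) {
  // Two extends feed one multiply feeding the plain reduction.
  if (RedCost < 2 * ExtCost + MulCost + BaseCost)
    return RedCost;     // report the fused cost for the reduction root
  return std::nullopt;  // signal: use the original cost modelling
}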
6369
6370 InstructionCost
6371 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6372 ElementCount VF) {
6373 // Calculate scalar cost only. Vectorization cost should be ready at this
6374 // moment.
6375 if (VF.isScalar()) {
6376 Type *ValTy = getLoadStoreType(I);
6377 const Align Alignment = getLoadStoreAlignment(I);
6378 unsigned AS = getLoadStoreAddressSpace(I);
6379
6380 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6381 return TTI.getAddressComputationCost(ValTy) +
6382 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6383 TTI::TCK_RecipThroughput, OpInfo, I);
6384 }
6385 return getWideningCost(I, VF);
6386}
6387
6388 LoopVectorizationCostModel::VectorizationCostTy
6389 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6390 ElementCount VF) {
6391 // If we know that this instruction will remain uniform, check the cost of
6392 // the scalar version.
6393 if (isUniformAfterVectorization(I, VF))
6394 VF = ElementCount::getFixed(1);
6395
6396 if (VF.isVector() && isProfitableToScalarize(I, VF))
6397 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6398
6399 // Forced scalars do not have any scalarization overhead.
6400 auto ForcedScalar = ForcedScalars.find(VF);
6401 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6402 auto InstSet = ForcedScalar->second;
6403 if (InstSet.count(I))
6404 return VectorizationCostTy(
6405 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6406 VF.getKnownMinValue()),
6407 false);
6408 }
6409
6410 Type *VectorTy;
6411 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6412
6413 bool TypeNotScalarized = false;
6414 if (VF.isVector() && VectorTy->isVectorTy()) {
6415 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6416 if (VF.isScalable())
6417 // <vscale x 1 x iN> is assumed to be profitable over iN because
6418 // scalable registers are a distinct register class from scalar ones.
6419 // If we ever find a target which wants to lower scalable vectors
6420 // back to scalars, we'll need to update this code to explicitly
6421 // ask TTI about the register class uses for each part.
6422 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6423 else
6424 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6425 } else
6426 C = InstructionCost::getInvalid();
6427 }
6428 return VectorizationCostTy(C, TypeNotScalarized);
6429}
6430
6431InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6432 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6433
6434 // There is no mechanism yet to create a scalable scalarization loop,
6435 // so this is currently Invalid.
6436 if (VF.isScalable())
6437 return InstructionCost::getInvalid();
6438
6439 if (VF.isScalar())
6440 return 0;
6441
6442 InstructionCost Cost = 0;
6443 Type *RetTy = ToVectorTy(I->getType(), VF);
6444 if (!RetTy->isVoidTy() &&
6445 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6446 Cost += TTI.getScalarizationOverhead(
6447 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6448 /*Insert*/ true,
6449 /*Extract*/ false, CostKind);
6450
6451 // Some targets keep addresses scalar.
6452 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6453 return Cost;
6454
6455 // Some targets support efficient element stores.
6456 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6457 return Cost;
6458
6459 // Collect operands to consider.
6460 CallInst *CI = dyn_cast<CallInst>(I);
6461 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6462
6463 // Skip operands that do not require extraction/scalarization and do not incur
6464 // any overhead.
6465 SmallVector<Type *> Tys;
6466 for (auto *V : filterExtractingOperands(Ops, VF))
6467 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6468 return Cost + TTI.getOperandsScalarizationOverhead(
6469 filterExtractingOperands(Ops, VF), Tys, CostKind);
6470}
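A standalone sketch of the overhead accounted above: inserting VF scalar results back into a vector and extracting the operands that feed the scalar copies. The per-element costs are placeholders for the TTI insert/extract queries.

#include <cstdint>

uint64_t scalarizationOverhead(unsigned VF, unsigned NumExtractedOperands) {
  const uint64_t InsertCost = 1, ExtractCost = 1;
  uint64_t Cost = VF * InsertCost;                 // gather results into a vector
  Cost += VF * NumExtractedOperands * ExtractCost; // unpack vector operands
  return Cost;
}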
6471
6472 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6473 if (VF.isScalar())
6474 return;
6475 NumPredStores = 0;
6476 for (BasicBlock *BB : TheLoop->blocks()) {
6477 // For each instruction in the old loop.
6478 for (Instruction &I : *BB) {
6479 Value *Ptr = getLoadStorePointerOperand(&I);
6480 if (!Ptr)
6481 continue;
6482
6483 // TODO: We should generate better code and update the cost model for
6484 // predicated uniform stores. Today they are treated as any other
6485 // predicated store (see added test cases in
6486 // invariant-store-vectorization.ll).
6487 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6488 NumPredStores++;
6489
6490 if (Legal->isUniformMemOp(I, VF)) {
6491 auto isLegalToScalarize = [&]() {
6492 if (!VF.isScalable())
6493 // Scalarization of fixed length vectors "just works".
6494 return true;
6495
6496 // We have dedicated lowering for unpredicated uniform loads and
6497 // stores. Note that even with tail folding we know that at least
6498 // one lane is active (i.e. generalized predication is not possible
6499 // here), and the logic below depends on this fact.
6500 if (!foldTailByMasking())
6501 return true;
6502
6503 // For scalable vectors, a uniform memop load is always
6504 // uniform-by-parts and we know how to scalarize that.
6505 if (isa<LoadInst>(I))
6506 return true;
6507
6508 // A uniform store isn't necessarily uniform-by-part
6509 // and we can't assume scalarization.
6510 auto &SI = cast<StoreInst>(I);
6511 return TheLoop->isLoopInvariant(SI.getValueOperand());
6512 };
6513
6514 const InstructionCost GatherScatterCost =
6515 isLegalGatherOrScatter(&I, VF) ?
6516 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6517
6518 // Load: Scalar load + broadcast
6519 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6520 // FIXME: This cost is a significant under-estimate for tail folded
6521 // memory ops.
6522 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6523 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6524
6525 // Choose the better solution for the current VF. Note that Invalid
6526 // costs compare as maximally large. If both are invalid, we get a
6527 // scalable invalid cost, which signals a failure and a vectorization abort.
6528 if (GatherScatterCost < ScalarizationCost)
6529 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6530 else
6531 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6532 continue;
6533 }
6534
6535 // We assume that widening is the best solution when possible.
6536 if (memoryInstructionCanBeWidened(&I, VF)) {
6537 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6538 int ConsecutiveStride = Legal->isConsecutivePtr(
6539 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6540 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6541 "Expected consecutive stride.");
6542 InstWidening Decision =
6543 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6544 setWideningDecision(&I, VF, Decision, Cost);
6545 continue;
6546 }
6547
6548 // Choose between Interleaving, Gather/Scatter or Scalarization.
6549 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6550 unsigned NumAccesses = 1;
6551 if (isAccessInterleaved(&I)) {
6552 auto Group = getInterleavedAccessGroup(&I);
6553 assert(Group && "Fail to get an interleaved access group.");
6554
6555 // Make one decision for the whole group.
6556 if (getWideningDecision(&I, VF) != CM_Unknown)
6557 continue;
6558
6559 NumAccesses = Group->getNumMembers();
6560 if (interleavedAccessCanBeWidened(&I, VF))
6561 InterleaveCost = getInterleaveGroupCost(&I, VF);
6562 }
6563
6564 InstructionCost GatherScatterCost =
6565 isLegalGatherOrScatter(&I, VF)
6566 ? getGatherScatterCost(&I, VF) * NumAccesses
6567 : InstructionCost::getInvalid();
6568
6569 InstructionCost ScalarizationCost =
6570 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6571
6572 // Choose better solution for the current VF,
6573 // write down this decision and use it during vectorization.
6574 InstructionCost Cost;
6575 InstWidening Decision;
6576 if (InterleaveCost <= GatherScatterCost &&
6577 InterleaveCost < ScalarizationCost) {
6578 Decision = CM_Interleave;
6579 Cost = InterleaveCost;
6580 } else if (GatherScatterCost < ScalarizationCost) {
6581 Decision = CM_GatherScatter;
6582 Cost = GatherScatterCost;
6583 } else {
6584 Decision = CM_Scalarize;
6585 Cost = ScalarizationCost;
6586 }
6587 // If the instruction belongs to an interleave group, the whole group
6588 // receives the same decision. The whole group receives the cost, but
6589 // the cost will actually be assigned to one instruction.
6590 if (auto Group = getInterleavedAccessGroup(&I))
6591 setWideningDecision(Group, VF, Decision, Cost);
6592 else
6593 setWideningDecision(&I, VF, Decision, Cost);
6594 }
6595 }
6596
6597 // Make sure that any load of address and any other address computation
6598 // remains scalar unless there is gather/scatter support. This avoids
6599 // inevitable extracts into address registers, and also has the benefit of
6600 // activating LSR more, since that pass can't optimize vectorized
6601 // addresses.
6602 if (TTI.prefersVectorizedAddressing())
6603 return;
6604
6605 // Start with all scalar pointer uses.
6606 SmallPtrSet<Instruction *, 8> AddrDefs;
6607 for (BasicBlock *BB : TheLoop->blocks())
6608 for (Instruction &I : *BB) {
6609 Instruction *PtrDef =
6610 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6611 if (PtrDef && TheLoop->contains(PtrDef) &&
6612 getWideningDecision(&I, VF) != CM_GatherScatter)
6613 AddrDefs.insert(PtrDef);
6614 }
6615
6616 // Add all instructions used to generate the addresses.
6617 SmallVector<Instruction *, 4> Worklist;
6618 append_range(Worklist, AddrDefs);
6619 while (!Worklist.empty()) {
6620 Instruction *I = Worklist.pop_back_val();
6621 for (auto &Op : I->operands())
6622 if (auto *InstOp = dyn_cast<Instruction>(Op))
6623 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6624 AddrDefs.insert(InstOp).second)
6625 Worklist.push_back(InstOp);
6626 }
6627
6628 for (auto *I : AddrDefs) {
6629 if (isa<LoadInst>(I)) {
6630 // Setting the desired widening decision should ideally be handled by
6631 // cost functions, but since this involves the task of finding out
6632 // if the loaded register is involved in an address computation, it is
6633 // instead changed here when we know this is the case.
6634 InstWidening Decision = getWideningDecision(I, VF);
6635 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6636 // Scalarize a widened load of address.
6637 setWideningDecision(
6638 I, VF, CM_Scalarize,
6639 (VF.getKnownMinValue() *
6640 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6641 else if (auto Group = getInterleavedAccessGroup(I)) {
6642 // Scalarize an interleave group of address loads.
6643 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6644 if (Instruction *Member = Group->getMember(I))
6645 setWideningDecision(
6646 Member, VF, CM_Scalarize,
6647 (VF.getKnownMinValue() *
6648 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6649 }
6650 }
6651 } else
6652 // Make sure I gets scalarized and is assigned a cost estimate without
6653 // scalarization overhead.
6654 ForcedScalars[VF].insert(I);
6655 }
6656}
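A standalone sketch of the three-way choice made above between interleaving, gather/scatter and scalarization. An option that is not legal can be passed as InvalidCost so it loses every comparison, roughly mirroring how InstructionCost::getInvalid() compares; this is an illustrative simplification, not the LLVM type.

#include <cstdint>
#include <limits>

enum class Widening { Interleave, GatherScatter, Scalarize };

// Callers model an unavailable option as InvalidCost.
constexpr uint64_t InvalidCost = std::numeric_limits<uint64_t>::max();

Widening pickMemWidening(uint64_t InterleaveCost, uint64_t GatherScatterCost,
                         uint64_t ScalarizationCost) {
  if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
    return Widening::Interleave;
  if (GatherScatterCost < ScalarizationCost)
    return Widening::GatherScatter;
  return Widening::Scalarize;
}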
6657
6658 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6659 assert(!VF.isScalar() &&
6660 "Trying to set a vectorization decision for a scalar VF");
6661
6662 for (BasicBlock *BB : TheLoop->blocks()) {
6663 // For each instruction in the old loop.
6664 for (Instruction &I : *BB) {
6665 CallInst *CI = dyn_cast<CallInst>(&I);
6666
6667 if (!CI)
6668 continue;
6669
6670 InstructionCost ScalarCost = InstructionCost::getInvalid();
6671 InstructionCost VectorCost = InstructionCost::getInvalid();
6672 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6673 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6674
6675 Function *ScalarFunc = CI->getCalledFunction();
6676 Type *ScalarRetTy = CI->getType();
6677 SmallVector<Type *, 4> Tys, ScalarTys;
6678 bool MaskRequired = Legal->isMaskRequired(CI);
6679 for (auto &ArgOp : CI->args())
6680 ScalarTys.push_back(ArgOp->getType());
6681
6682 // Compute corresponding vector type for return value and arguments.
6683 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6684 for (Type *ScalarTy : ScalarTys)
6685 Tys.push_back(ToVectorTy(ScalarTy, VF));
6686
6687 // An in-loop reduction using an fmuladd intrinsic is a special case;
6688 // we don't want the normal cost for that intrinsic.
6689 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6690 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6691 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6692 getVectorIntrinsicIDForCall(CI, TLI),
6693 std::nullopt, *RedCost);
6694 continue;
6695 }
6696
6697 // Estimate cost of scalarized vector call. The source operands are
6698 // assumed to be vectors, so we need to extract individual elements from
6699 // there, execute VF scalar calls, and then gather the result into the
6700 // vector return value.
6701 InstructionCost ScalarCallCost =
6702 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6703
6704 // Compute costs of unpacking argument values for the scalar calls and
6705 // packing the return values to a vector.
6706 InstructionCost ScalarizationCost =
6707 getScalarizationOverhead(CI, VF, CostKind);
6708
6709 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6710
6711 // Find the cost of vectorizing the call, if we can find a suitable
6712 // vector variant of the function.
6713 bool UsesMask = false;
6714 VFInfo FuncInfo;
6715 Function *VecFunc = nullptr;
6716 // Search through any available variants for one we can use at this VF.
6717 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6718 // Must match requested VF.
6719 if (Info.Shape.VF != VF)
6720 continue;
6721
6722 // Must take a mask argument if one is required
6723 if (MaskRequired && !Info.isMasked())
6724 continue;
6725
6726 // Check that all parameter kinds are supported
6727 bool ParamsOk = true;
6728 for (VFParameter Param : Info.Shape.Parameters) {
6729 switch (Param.ParamKind) {
6730 case VFParamKind::Vector:
6731 break;
6732 case VFParamKind::OMP_Uniform: {
6733 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6734 // Make sure the scalar parameter in the loop is invariant.
6735 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6736 TheLoop))
6737 ParamsOk = false;
6738 break;
6739 }
6740 case VFParamKind::OMP_Linear: {
6741 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6742 // Find the stride for the scalar parameter in this loop and see if
6743 // it matches the stride for the variant.
6744 // TODO: do we need to figure out the cost of an extract to get the
6745 // first lane? Or do we hope that it will be folded away?
6746 ScalarEvolution *SE = PSE.getSE();
6747 const auto *SAR =
6748 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6749
6750 if (!SAR || SAR->getLoop() != TheLoop) {
6751 ParamsOk = false;
6752 break;
6753 }
6754
6755 const SCEVConstant *Step =
6756 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6757
6758 if (!Step ||
6759 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6760 ParamsOk = false;
6761
6762 break;
6763 }
6764 case VFParamKind::GlobalPredicate:
6765 UsesMask = true;
6766 break;
6767 default:
6768 ParamsOk = false;
6769 break;
6770 }
6771 }
6772
6773 if (!ParamsOk)
6774 continue;
6775
6776 // Found a suitable candidate, stop here.
6777 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6778 FuncInfo = Info;
6779 break;
6780 }
6781
6782 // Add in the cost of synthesizing a mask if one wasn't required.
6783 InstructionCost MaskCost = 0;
6784 if (VecFunc && UsesMask && !MaskRequired)
6785 MaskCost = TTI.getShuffleCost(
6786 TargetTransformInfo::SK_Broadcast,
6787 VectorType::get(IntegerType::getInt1Ty(
6788 VecFunc->getFunctionType()->getContext()),
6789 VF));
6790
6791 if (TLI && VecFunc && !CI->isNoBuiltin())
6792 VectorCost =
6793 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6794
6795 // Find the cost of an intrinsic; some targets may have instructions that
6796 // perform the operation without needing an actual call.
6797 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6798 if (IID != Intrinsic::not_intrinsic)
6799 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6800
6801 InstructionCost Cost = ScalarCost;
6802 InstWidening Decision = CM_Scalarize;
6803
6804 if (VectorCost <= Cost) {
6805 Cost = VectorCost;
6806 Decision = CM_VectorCall;
6807 }
6808
6809 if (IntrinsicCost <= Cost) {
6810 Cost = IntrinsicCost;
6811 Decision = CM_IntrinsicCall;
6812 }
6813
6814 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6815 FuncInfo.getParamIndexForOptionalMask(), Cost);
6816 }
6817 }
6818}
6819
6820 InstructionCost
6821 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6822 Type *&VectorTy) {
6823 Type *RetTy = I->getType();
6824 if (canTruncateToMinimalBitwidth(I, VF))
6825 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6826 auto SE = PSE.getSE();
6827 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6828
6829 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6830 ElementCount VF) -> bool {
6831 if (VF.isScalar())
6832 return true;
6833
6834 auto Scalarized = InstsToScalarize.find(VF);
6835 assert(Scalarized != InstsToScalarize.end() &&
6836 "VF not yet analyzed for scalarization profitability");
6837 return !Scalarized->second.count(I) &&
6838 llvm::all_of(I->users(), [&](User *U) {
6839 auto *UI = cast<Instruction>(U);
6840 return !Scalarized->second.count(UI);
6841 });
6842 };
6843 (void) hasSingleCopyAfterVectorization;
6844
6845 if (isScalarAfterVectorization(I, VF)) {
6846 // With the exception of GEPs and PHIs, after scalarization there should
6847 // only be one copy of the instruction generated in the loop. This is
6848 // because the VF is either 1, or any instructions that need scalarizing
6849 // have already been dealt with by the time we get here. As a result,
6850 // it means we don't have to multiply the instruction cost by VF.
6851 assert(I->getOpcode() == Instruction::GetElementPtr ||
6852 I->getOpcode() == Instruction::PHI ||
6853 (I->getOpcode() == Instruction::BitCast &&
6854 I->getType()->isPointerTy()) ||
6855 hasSingleCopyAfterVectorization(I, VF));
6856 VectorTy = RetTy;
6857 } else
6858 VectorTy = ToVectorTy(RetTy, VF);
6859
6860 // TODO: We need to estimate the cost of intrinsic calls.
6861 switch (I->getOpcode()) {
6862 case Instruction::GetElementPtr:
6863 // We mark this instruction as zero-cost because the cost of GEPs in
6864 // vectorized code depends on whether the corresponding memory instruction
6865 // is scalarized or not. Therefore, we handle GEPs with the memory
6866 // instruction cost.
6867 return 0;
6868 case Instruction::Br: {
6869 // In cases of scalarized and predicated instructions, there will be VF
6870 // predicated blocks in the vectorized loop. Each branch around these
6871 // blocks requires also an extract of its vector compare i1 element.
6872 // Note that the conditional branch from the loop latch will be replaced by
6873 // a single branch controlling the loop, so there is no extra overhead from
6874 // scalarization.
6875 bool ScalarPredicatedBB = false;
6876 BranchInst *BI = cast<BranchInst>(I);
6877 if (VF.isVector() && BI->isConditional() &&
6878 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6879 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6880 BI->getParent() != TheLoop->getLoopLatch())
6881 ScalarPredicatedBB = true;
6882
6883 if (ScalarPredicatedBB) {
6884 // Not possible to scalarize scalable vector with predicated instructions.
6885 if (VF.isScalable())
6886 return InstructionCost::getInvalid();
6887 // Return cost for branches around scalarized and predicated blocks.
6888 auto *Vec_i1Ty =
6889 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6890 return (
6891 TTI.getScalarizationOverhead(
6892 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6893 /*Insert*/ false, /*Extract*/ true, CostKind) +
6894 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6895 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6896 // The back-edge branch will remain, as will all scalar branches.
6897 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6898 else
6899 // This branch will be eliminated by if-conversion.
6900 return 0;
6901 // Note: We currently assume zero cost for an unconditional branch inside
6902 // a predicated block since it will become a fall-through, although we
6903 // may decide in the future to call TTI for all branches.
6904 }
6905 case Instruction::PHI: {
6906 auto *Phi = cast<PHINode>(I);
6907
6908 // First-order recurrences are replaced by vector shuffles inside the loop.
6909 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6910 SmallVector<int> Mask(VF.getKnownMinValue());
6911 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6912 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6913 cast<VectorType>(VectorTy), Mask, CostKind,
6914 VF.getKnownMinValue() - 1);
6915 }
6916
6917 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6918 // converted into select instructions. We require N - 1 selects per phi
6919 // node, where N is the number of incoming values.
6920 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6921 return (Phi->getNumIncomingValues() - 1) *
6922 TTI.getCmpSelInstrCost(
6923 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6924 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6925 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6926
6927 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6928 }
6929 case Instruction::UDiv:
6930 case Instruction::SDiv:
6931 case Instruction::URem:
6932 case Instruction::SRem:
6933 if (VF.isVector() && isPredicatedInst(I)) {
6934 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6935 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6936 ScalarCost : SafeDivisorCost;
6937 }
6938 // We've proven all lanes safe to speculate, fall through.
6939 [[fallthrough]];
6940 case Instruction::Add:
6941 case Instruction::FAdd:
6942 case Instruction::Sub:
6943 case Instruction::FSub:
6944 case Instruction::Mul:
6945 case Instruction::FMul:
6946 case Instruction::FDiv:
6947 case Instruction::FRem:
6948 case Instruction::Shl:
6949 case Instruction::LShr:
6950 case Instruction::AShr:
6951 case Instruction::And:
6952 case Instruction::Or:
6953 case Instruction::Xor: {
6954 // If we're speculating on the stride being 1, the multiplication may
6955 // fold away. We can generalize this for all operations using the notion
6956 // of neutral elements. (TODO)
6957 if (I->getOpcode() == Instruction::Mul &&
6958 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6959 PSE.getSCEV(I->getOperand(1))->isOne()))
6960 return 0;
6961
6962 // Detect reduction patterns
6963 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6964 return *RedCost;
6965
6966 // Certain instructions can be cheaper to vectorize if they have a constant
6967 // second vector operand. One example of this are shifts on x86.
6968 Value *Op2 = I->getOperand(1);
6969 auto Op2Info = TTI.getOperandInfo(Op2);
6970 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6971 Legal->isInvariant(Op2))
6972 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6973
6974 SmallVector<const Value *, 4> Operands(I->operand_values());
6975 return TTI.getArithmeticInstrCost(
6976 I->getOpcode(), VectorTy, CostKind,
6977 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6978 Op2Info, Operands, I, TLI);
6979 }
6980 case Instruction::FNeg: {
6981 return TTI.getArithmeticInstrCost(
6982 I->getOpcode(), VectorTy, CostKind,
6983 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6984 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6985 I->getOperand(0), I);
6986 }
6987 case Instruction::Select: {
6988 SelectInst *SI = cast<SelectInst>(I);
6989 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6990 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6991
6992 const Value *Op0, *Op1;
6993 using namespace llvm::PatternMatch;
6994 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6995 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6996 // select x, y, false --> x & y
6997 // select x, true, y --> x | y
6998 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6999 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7000 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7001 Op1->getType()->getScalarSizeInBits() == 1);
7002
7003 SmallVector<const Value *, 2> Operands{Op0, Op1};
7004 return TTI.getArithmeticInstrCost(
7005 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7006 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7007 }
7008
7009 Type *CondTy = SI->getCondition()->getType();
7010 if (!ScalarCond)
7011 CondTy = VectorType::get(CondTy, VF);
7012
7013 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7014 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7015 Pred = Cmp->getPredicate();
7016 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7017 CostKind, I);
7018 }
7019 case Instruction::ICmp:
7020 case Instruction::FCmp: {
7021 Type *ValTy = I->getOperand(0)->getType();
7022 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7023 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7024 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7025 VectorTy = ToVectorTy(ValTy, VF);
7026 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7027 cast<CmpInst>(I)->getPredicate(), CostKind,
7028 I);
7029 }
7030 case Instruction::Store:
7031 case Instruction::Load: {
7032 ElementCount Width = VF;
7033 if (Width.isVector()) {
7034 InstWidening Decision = getWideningDecision(I, Width);
7035 assert(Decision != CM_Unknown &&
7036 "CM decision should be taken at this point");
7037 if (getWideningCost(I, Width) == InstructionCost::getInvalid())
7038 return InstructionCost::getInvalid();
7039 if (Decision == CM_Scalarize)
7040 Width = ElementCount::getFixed(1);
7041 }
7042 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7043 return getMemoryInstructionCost(I, VF);
7044 }
7045 case Instruction::BitCast:
7046 if (I->getType()->isPointerTy())
7047 return 0;
7048 [[fallthrough]];
7049 case Instruction::ZExt:
7050 case Instruction::SExt:
7051 case Instruction::FPToUI:
7052 case Instruction::FPToSI:
7053 case Instruction::FPExt:
7054 case Instruction::PtrToInt:
7055 case Instruction::IntToPtr:
7056 case Instruction::SIToFP:
7057 case Instruction::UIToFP:
7058 case Instruction::Trunc:
7059 case Instruction::FPTrunc: {
7060 // Computes the CastContextHint from a Load/Store instruction.
7061 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7062 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7063 "Expected a load or a store!");
7064
7065 if (VF.isScalar() || !TheLoop->contains(I))
7066 return TTI::CastContextHint::Normal;
7067
7068 switch (getWideningDecision(I, VF)) {
7069 case LoopVectorizationCostModel::CM_GatherScatter:
7070 return TTI::CastContextHint::GatherScatter;
7071 case LoopVectorizationCostModel::CM_Interleave:
7072 return TTI::CastContextHint::Interleave;
7073 case LoopVectorizationCostModel::CM_Scalarize:
7074 case LoopVectorizationCostModel::CM_Widen:
7075 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7076 : TTI::CastContextHint::Normal;
7077 case LoopVectorizationCostModel::CM_Widen_Reverse:
7078 return TTI::CastContextHint::Reversed;
7079 case LoopVectorizationCostModel::CM_Unknown:
7080 llvm_unreachable("Instr did not go through cost modelling?");
7081 case LoopVectorizationCostModel::CM_VectorCall:
7082 case LoopVectorizationCostModel::CM_IntrinsicCall:
7083 llvm_unreachable_internal("Instr has invalid widening decision");
7084 }
7085
7086 llvm_unreachable("Unhandled case!");
7087 };
7088
7089 unsigned Opcode = I->getOpcode();
7090 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7091 // For Trunc, the context is the only user, which must be a StoreInst.
7092 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7093 if (I->hasOneUse())
7094 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7095 CCH = ComputeCCH(Store);
7096 }
7097 // For Z/Sext, the context is the operand, which must be a LoadInst.
7098 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7099 Opcode == Instruction::FPExt) {
7100 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7101 CCH = ComputeCCH(Load);
7102 }
7103
7104 // We optimize the truncation of induction variables having constant
7105 // integer steps. The cost of these truncations is the same as the scalar
7106 // operation.
7107 if (isOptimizableIVTruncate(I, VF)) {
7108 auto *Trunc = cast<TruncInst>(I);
7109 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7110 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7111 }
7112
7113 // Detect reduction patterns
7114 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7115 return *RedCost;
7116
7117 Type *SrcScalarTy = I->getOperand(0)->getType();
7118 Type *SrcVecTy =
7119 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7120 if (canTruncateToMinimalBitwidth(I, VF)) {
7121 // This cast is going to be shrunk. This may remove the cast or it might
7122 // turn it into slightly different cast. For example, if MinBW == 16,
7123 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7124 //
7125 // Calculate the modified src and dest types.
7126 Type *MinVecTy = VectorTy;
7127 if (Opcode == Instruction::Trunc) {
7128 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7129 VectorTy =
7130 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7131 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7132 // Leave SrcVecTy unchanged - we only shrink the destination element
7133 // type.
7134 VectorTy =
7135 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7136 }
7137 }
7138
7139 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7140 }
7141 case Instruction::Call:
7142 return getVectorCallCost(cast<CallInst>(I), VF);
7143 case Instruction::ExtractValue:
7144 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7145 case Instruction::Alloca:
7146 // We cannot easily widen alloca to a scalable alloca, as
7147 // the result would need to be a vector of pointers.
7148 if (VF.isScalable())
7149 return InstructionCost::getInvalid();
7150 [[fallthrough]];
7151 default:
7152 // This opcode is unknown. Assume that it is the same as 'mul'.
7153 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7154 } // end of switch.
7155}
7156
7157 void LoopVectorizationCostModel::collectValuesToIgnore() {
7158 // Ignore ephemeral values.
7159 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7160
7161 // Find all stores to invariant variables. Since they are going to sink
7162 // outside the loop we do not need calculate cost for them.
7163 for (BasicBlock *BB : TheLoop->blocks())
7164 for (Instruction &I : *BB) {
7165 StoreInst *SI;
7166 if ((SI = dyn_cast<StoreInst>(&I)) &&
7167 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7168 ValuesToIgnore.insert(&I);
7169 }
7170
7171 // Ignore type-promoting instructions we identified during reduction
7172 // detection.
7173 for (const auto &Reduction : Legal->getReductionVars()) {
7174 const RecurrenceDescriptor &RedDes = Reduction.second;
7175 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7176 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7177 }
7178 // Ignore type-casting instructions we identified during induction
7179 // detection.
7180 for (const auto &Induction : Legal->getInductionVars()) {
7181 const InductionDescriptor &IndDes = Induction.second;
7182 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7183 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7184 }
7185}
7186
7187 void LoopVectorizationCostModel::collectInLoopReductions() {
7188 for (const auto &Reduction : Legal->getReductionVars()) {
7189 PHINode *Phi = Reduction.first;
7190 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7191
7192 // We don't collect reductions that are type promoted (yet).
7193 if (RdxDesc.getRecurrenceType() != Phi->getType())
7194 continue;
7195
7196 // If the target would prefer this reduction to happen "in-loop", then we
7197 // want to record it as such.
7198 unsigned Opcode = RdxDesc.getOpcode();
7199 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7200 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7201 TargetTransformInfo::ReductionFlags()))
7202 continue;
7203
7204 // Check that we can correctly put the reductions into the loop, by
7205 // finding the chain of operations that leads from the phi to the loop
7206 // exit value.
7207 SmallVector<Instruction *, 4> ReductionOperations =
7208 RdxDesc.getReductionOpChain(Phi, TheLoop);
7209 bool InLoop = !ReductionOperations.empty();
7210
7211 if (InLoop) {
7212 InLoopReductions.insert(Phi);
7213 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7214 Instruction *LastChain = Phi;
7215 for (auto *I : ReductionOperations) {
7216 InLoopReductionImmediateChains[I] = LastChain;
7217 LastChain = I;
7218 }
7219 }
7220 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7221 << " reduction for phi: " << *Phi << "\n");
7222 }
7223}
7224
7225 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7226 DebugLoc DL, const Twine &Name) {
7227 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7228 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7229 return tryInsertInstruction(
7230 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7231}
7232
7233// This function will select a scalable VF if the target supports scalable
7234// vectors and a fixed one otherwise.
7235// TODO: we could return a pair of values that specify the max VF and
7236// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7237// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7238// doesn't have a cost model that can choose which plan to execute if
7239// more than one is generated.
7240 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7241 LoopVectorizationCostModel &CM) {
7242 unsigned WidestType;
7243 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7244
7245 TargetTransformInfo::RegisterKind RegKind =
7246 TTI.enableScalableVectorization()
7247 ? TargetTransformInfo::RGK_ScalableVector
7248 : TargetTransformInfo::RGK_FixedWidthVector;
7249
7250 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7251 unsigned N = RegSize.getKnownMinValue() / WidestType;
7252 return ElementCount::get(N, RegSize.isScalable());
7253}
7254
7255 VectorizationFactor
7256 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7257 ElementCount VF = UserVF;
7258 // Outer loop handling: They may require CFG and instruction level
7259 // transformations before even evaluating whether vectorization is profitable.
7260 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7261 // the vectorization pipeline.
7262 if (!OrigLoop->isInnermost()) {
7263 // If the user doesn't provide a vectorization factor, determine a
7264 // reasonable one.
7265 if (UserVF.isZero()) {
7266 VF = determineVPlanVF(TTI, CM);
7267 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7268
7269 // Make sure we have a VF > 1 for stress testing.
7270 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7271 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7272 << "overriding computed VF.\n");
7273 VF = ElementCount::getFixed(4);
7274 }
7275 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7276 !ForceTargetSupportsScalableVectors) {
7277 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7278 << "not supported by the target.\n");
7280 "Scalable vectorization requested but not supported by the target",
7281 "the scalable user-specified vectorization width for outer-loop "
7282 "vectorization cannot be used because the target does not support "
7283 "scalable vectors.",
7284 "ScalableVFUnfeasible", ORE, OrigLoop);
7285 return VectorizationFactor::Disabled();
7286 }
7287 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7289 "VF needs to be a power of two");
7290 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7291 << "VF " << VF << " to build VPlans.\n");
7292 buildVPlans(VF, VF);
7293
7294 // For VPlan build stress testing, we bail out after VPlan construction.
7295 if (VPlanBuildStressTest)
7296 return VectorizationFactor::Disabled();
7297
7298 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7299 }
7300
7301 LLVM_DEBUG(
7302 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7303 "VPlan-native path.\n");
7304 return VectorizationFactor::Disabled();
7305}
7306
7307std::optional<VectorizationFactor>
7308 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7309 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7310 CM.collectValuesToIgnore();
7311 CM.collectElementTypesForWidening();
7312
7313 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7314 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7315 return std::nullopt;
7316
7317 // Invalidate interleave groups if all blocks of loop will be predicated.
7318 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7319 !useMaskedInterleavedAccesses(TTI)) {
7320 LLVM_DEBUG(
7321 dbgs()
7322 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7323 "which requires masked-interleaved support.\n");
7324 CM.InterleaveInfo.invalidateGroups();
7325 // Invalidating interleave groups also requires invalidating all decisions
7326 // based on them, which includes widening decisions and uniform and scalar
7327 // values.
7328 CM.invalidateCostModelingDecisions();
7329 }
7330
7331 ElementCount MaxUserVF =
7332 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7333 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7334 if (!UserVF.isZero() && UserVFIsLegal) {
7336 "VF needs to be a power of two");
7337 // Collect the instructions (and their associated costs) that will be more
7338 // profitable to scalarize.
7339 CM.collectInLoopReductions();
7340 if (CM.selectUserVectorizationFactor(UserVF)) {
7341 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7342 buildVPlansWithVPRecipes(UserVF, UserVF);
7343 if (!hasPlanWithVF(UserVF)) {
7344 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7345 << ".\n");
7346 return std::nullopt;
7347 }
7348
7349 LLVM_DEBUG(printPlans(dbgs()));
7350 return {{UserVF, 0, 0}};
7351 } else
7352 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7353 "InvalidCost", ORE, OrigLoop);
7354 }
7355
7356 // Populate the set of Vectorization Factor Candidates.
7357 ElementCountSet VFCandidates;
7358 for (auto VF = ElementCount::getFixed(1);
7359 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7360 VFCandidates.insert(VF);
7361 for (auto VF = ElementCount::getScalable(1);
7362 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7363 VFCandidates.insert(VF);
7364
7365 CM.collectInLoopReductions();
7366 for (const auto &VF : VFCandidates) {
7367 // Collect Uniform and Scalar instructions after vectorization with VF.
7368 CM.collectUniformsAndScalars(VF);
7369
7370 // Collect the instructions (and their associated costs) that will be more
7371 // profitable to scalarize.
7372 if (VF.isVector())
7373 CM.collectInstsToScalarize(VF);
7374 }
7375
7376 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7377 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7378
7379 LLVM_DEBUG(printPlans(dbgs()));
7380 if (!MaxFactors.hasVector())
7381 return VectorizationFactor::Disabled();
7382
7383 // Select the optimal vectorization factor.
7384 VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7385 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7386 if (!hasPlanWithVF(VF.Width)) {
7387 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7388 << ".\n");
7389 return std::nullopt;
7390 }
7391 return VF;
7392}
7393
7394 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7395 assert(count_if(VPlans,
7396 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7397 1 &&
7398 "Best VF has not a single VPlan.");
7399
7400 for (const VPlanPtr &Plan : VPlans) {
7401 if (Plan->hasVF(VF))
7402 return *Plan.get();
7403 }
7404 llvm_unreachable("No plan found!");
7405}
7406
7407 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7408 SmallVector<Metadata *, 4> MDs;
7409 // Reserve first location for self reference to the LoopID metadata node.
7410 MDs.push_back(nullptr);
7411 bool IsUnrollMetadata = false;
7412 MDNode *LoopID = L->getLoopID();
7413 if (LoopID) {
7414 // First find existing loop unrolling disable metadata.
7415 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7416 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7417 if (MD) {
7418 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7419 IsUnrollMetadata =
7420 S && S->getString().starts_with("llvm.loop.unroll.disable");
7421 }
7422 MDs.push_back(LoopID->getOperand(i));
7423 }
7424 }
7425
7426 if (!IsUnrollMetadata) {
7427 // Add runtime unroll disable metadata.
7428 LLVMContext &Context = L->getHeader()->getContext();
7429 SmallVector<Metadata *, 1> DisableOperands;
7430 DisableOperands.push_back(
7431 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7432 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7433 MDs.push_back(DisableNode);
7434 MDNode *NewLoopID = MDNode::get(Context, MDs);
7435 // Set operand 0 to refer to the loop id itself.
7436 NewLoopID->replaceOperandWith(0, NewLoopID);
7437 L->setLoopID(NewLoopID);
7438 }
7439}
7440
7441// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7442// create a merge phi node for it and add it to \p ReductionResumeValues.
7443 static void createAndCollectMergePhiForReduction(
7444 VPInstruction *RedResult,
7445 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7446 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7447 bool VectorizingEpilogue) {
7448 if (!RedResult ||
7449 RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7450 return;
7451
7452 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7453 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7454
7455 Value *FinalValue =
7456 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7457 auto *ResumePhi =
7458 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7459 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7460 RdxDesc.getRecurrenceKind())) {
7461 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7462 assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7463 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7464 ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7465 }
7466 assert((!VectorizingEpilogue || ResumePhi) &&
7467 "when vectorizing the epilogue loop, we need a resume phi from main "
7468 "vector loop");
7469
7470 // TODO: bc.merge.rdx should not be created here, instead it should be
7471 // modeled in VPlan.
7472 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7473 // Create a phi node that merges control-flow from the backedge-taken check
7474 // block and the middle block.
7475 auto *BCBlockPhi =
7476 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7477 LoopScalarPreHeader->getTerminator()->getIterator());
7478
7479 // If we are fixing reductions in the epilogue loop then we should already
7480 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7481 // we carry over the incoming values correctly.
7482 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7483 if (Incoming == LoopMiddleBlock)
7484 BCBlockPhi->addIncoming(FinalValue, Incoming);
7485 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7486 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7487 Incoming);
7488 else
7489 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7490 }
7491
7492 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7493 // TODO: This fixup should instead be modeled in VPlan.
7494 // Fix the scalar loop reduction variable with the incoming reduction sum
7495 // from the vector body and from the backedge value.
7496 int IncomingEdgeBlockIdx =
7497 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7498 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7499 // Pick the other block.
7500 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7501 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7502 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7503 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7504
7505 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7506}
7507
7508std::pair<DenseMap<const SCEV *, Value *>,
7509 DenseMap<const RecurrenceDescriptor *, Value *>>
7510 LoopVectorizationPlanner::executePlan(
7511 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7512 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7513 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7514 assert(BestVPlan.hasVF(BestVF) &&
7515 "Trying to execute plan with unsupported VF");
7516 assert(BestVPlan.hasUF(BestUF) &&
7517 "Trying to execute plan with unsupported UF");
7518 assert(
7519 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7520 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7521
7522 if (!IsEpilogueVectorization)
7523 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7524
7525 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7526 << ", UF=" << BestUF << '\n');
7527 BestVPlan.setName("Final VPlan");
7528 LLVM_DEBUG(BestVPlan.dump());
7529
7530 // Perform the actual loop transformation.
7531 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7532 OrigLoop->getHeader()->getContext());
7533
7534 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7535 // before making any changes to the CFG.
7536 if (!BestVPlan.getPreheader()->empty()) {
7537 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7539 BestVPlan.getPreheader()->execute(&State);
7540 }
7541 if (!ILV.getTripCount())
7542 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7543 else
7544 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7545 "count during epilogue vectorization");
7546
7547 // 1. Set up the skeleton for vectorization, including vector pre-header and
7548 // middle block. The vector loop is created during VPlan execution.
7549 Value *CanonicalIVStartValue;
7550 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7551 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7552 : State.ExpandedSCEVs);
7553
7554 // Only use noalias metadata when using memory checks guaranteeing no overlap
7555 // across all iterations.
7556 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7557 std::unique_ptr<LoopVersioning> LVer = nullptr;
7558 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7559 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7560
7561 // We currently don't use LoopVersioning for the actual loop cloning but we
7562 // still use it to add the noalias metadata.
7563 // TODO: Find a better way to re-use LoopVersioning functionality to add
7564 // metadata.
7565 LVer = std::make_unique<LoopVersioning>(
7566 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7567 PSE.getSE());
7568 State.LVer = &*LVer;
7569 State.LVer->prepareNoAliasMetadata();
7570 }
7571
7572 ILV.printDebugTracesAtStart();
7573
7574 //===------------------------------------------------===//
7575 //
7576 // Notice: any optimization or new instruction that go
7577 // into the code below should also be implemented in
7578 // the cost-model.
7579 //
7580 //===------------------------------------------------===//
7581
7582 // 2. Copy and widen instructions from the old loop into the new loop.
7583 BestVPlan.prepareToExecute(ILV.getTripCount(),
7584 ILV.getOrCreateVectorTripCount(nullptr),
7585 CanonicalIVStartValue, State);
7586
7587 BestVPlan.execute(&State);
7588
7589 // 2.5 Collect reduction resume values.
7590 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7591 auto *ExitVPBB =
7592 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7593 for (VPRecipeBase &R : *ExitVPBB) {
7594 createAndCollectMergePhiForReduction(
7595 dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7596 State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7597 }
7598
7599 // 2.6. Maintain Loop Hints
7600 // Keep all loop hints from the original loop on the vector loop (we'll
7601 // replace the vectorizer-specific hints below).
7602 MDNode *OrigLoopID = OrigLoop->getLoopID();
7603
7604 std::optional<MDNode *> VectorizedLoopID =
7605 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7606 LLVMLoopVectorizeFollowupVectorized});
7607
7608 VPBasicBlock *HeaderVPBB =
7609 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7610 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7611 if (VectorizedLoopID)
7612 L->setLoopID(*VectorizedLoopID);
7613 else {
7614 // Keep all loop hints from the original loop on the vector loop (we'll
7615 // replace the vectorizer-specific hints below).
7616 if (MDNode *LID = OrigLoop->getLoopID())
7617 L->setLoopID(LID);
7618
7619 LoopVectorizeHints Hints(L, true, *ORE);
7620 Hints.setAlreadyVectorized();
7621 }
7622 TargetTransformInfo::UnrollingPreferences UP;
7623 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7624 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7625 AddRuntimeUnrollDisableMetaData(L);
7626
7627 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7628 // predication, updating analyses.
7629 ILV.fixVectorizedLoop(State, BestVPlan);
7630
7631 ILV.printDebugTracesAtEnd();
7632
7633 return {State.ExpandedSCEVs, ReductionResumeValues};
7634}
7635
7636#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7637 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7638 for (const auto &Plan : VPlans)
7639 if (PrintVPlansInDotFormat)
7640 Plan->printDOT(O);
7641 else
7642 Plan->print(O);
7643}
7644#endif
7645
7646//===--------------------------------------------------------------------===//
7647// EpilogueVectorizerMainLoop
7648//===--------------------------------------------------------------------===//
7649
7650/// This function is partially responsible for generating the control flow
7651/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7652std::pair<BasicBlock *, Value *>
7653 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7654 const SCEV2ValueTy &ExpandedSCEVs) {
7655 createVectorLoopSkeleton("");
7656
7657 // Generate the code to check the minimum iteration count of the vector
7658 // epilogue (see below).
7662
7663 // Generate the code to check any assumptions that we've made for SCEV
7664 // expressions.
7666
7667 // Generate the code that checks at runtime if arrays overlap. We put the
7668 // checks into a separate block to make the more common case of few elements
7669 // faster.
7671
7672 // Generate the iteration count check for the main loop, *after* the check
7673 // for the epilogue loop, so that the path-length is shorter for the case
7674 // that goes directly through the vector epilogue. The longer-path length for
7675 // the main loop is compensated for by the gain from vectorizing the larger
7676 // trip count. Note: the branch will get updated later on when we vectorize
7677 // the epilogue.
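// For illustration only (simplified; see the documentation link above for the
// authoritative diagram), the skeleton produced by the two passes is roughly:
//   iter.check -> vector.main.loop.iter.check -> vector.ph -> vector body
//     -> middle.block -> vec.epilog.iter.check -> vec.epilog.ph
//     -> vec.epilog.vector.body -> ... -> scalar.ph -> scalar loop
// where each iteration-count check can branch past its vector loop when too
// few iterations remain.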
7680
7681 // Generate the induction variable.
7683
7684 // Skip induction resume value creation here because they will be created in
7685 // the second pass for the scalar loop. The induction resume values for the
7686 // inductions in the epilogue loop are created before executing the plan for
7687 // the epilogue loop.
7688
7689 return {completeLoopSkeleton(), nullptr};
7690}
7691
7693 LLVM_DEBUG({
7694 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7695 << "Main Loop VF:" << EPI.MainLoopVF
7696 << ", Main Loop UF:" << EPI.MainLoopUF
7697 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7698 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7699 });
7700}
7701
7704 dbgs() << "intermediate fn:\n"
7705 << *OrigLoop->getHeader()->getParent() << "\n";
7706 });
7707}
7708
7709BasicBlock *
7711 bool ForEpilogue) {
7712 assert(Bypass && "Expected valid bypass basic block.");
7713 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7714 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7715 Value *Count = getTripCount();
7716 // Reuse existing vector loop preheader for TC checks.
7717 // Note that a new preheader block is generated for the vector loop.
7718 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7719 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7720
7721 // Generate code to check if the loop's trip count is less than VF * UF of the
7722 // main vector loop.
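// For illustration (hypothetical factors): with VFactor = 4 and UFactor = 2
// the emitted check is roughly
//   %min.iters.check = icmp ult i64 %count, 8
// (ule instead of ult when a scalar epilogue is required), and the branch
// created below sends short trip counts to the bypass block.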
7723 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7724 : VF.isVector())
7725 ? ICmpInst::ICMP_ULE
7726 : ICmpInst::ICMP_ULT;
7727 
7728 Value *CheckMinIters = Builder.CreateICmp(
7729 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7730 "min.iters.check");
7731
7732 if (!ForEpilogue)
7733 TCCheckBlock->setName("vector.main.loop.iter.check");
7734
7735 // Create new preheader for vector loop.
7736 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7737 DT, LI, nullptr, "vector.ph");
7738
7739 if (ForEpilogue) {
7740 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7741 DT->getNode(Bypass)->getIDom()) &&
7742 "TC check is expected to dominate Bypass");
7743
7744 // Update dominator for Bypass & LoopExit.
7745 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7746 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7747 // For loops with multiple exits, there's no edge from the middle block
7748 // to exit blocks (as the epilogue must run) and thus no need to update
7749 // the immediate dominator of the exit blocks.
7751
7752 LoopBypassBlocks.push_back(TCCheckBlock);
7753
7754 // Save the trip count so we don't have to regenerate it in the
7755 // vec.epilog.iter.check. This is safe to do because the trip count
7756 // generated here dominates the vector epilog iter check.
7757 EPI.TripCount = Count;
7758 }
7759
7760 BranchInst &BI =
7761 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7764 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7765
7766 return TCCheckBlock;
7767}
7768
7769//===--------------------------------------------------------------------===//
7770// EpilogueVectorizerEpilogueLoop
7771//===--------------------------------------------------------------------===//
7772
7773/// This function is partially responsible for generating the control flow
7774/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7775std::pair<BasicBlock *, Value *>
7777 const SCEV2ValueTy &ExpandedSCEVs) {
7778 createVectorLoopSkeleton("vec.epilog.");
7779
7780 // Now, compare the remaining count; if there aren't enough iterations to
7781 // execute the vectorized epilogue, skip to the scalar part.
7782 LoopVectorPreHeader->setName("vec.epilog.ph");
7783 BasicBlock *VecEpilogueIterationCountCheck =
7785 nullptr, "vec.epilog.iter.check", true);
7787 VecEpilogueIterationCountCheck);
7788
7789 // Adjust the control flow taking the state info from the main loop
7790 // vectorization into account.
7792 "expected this to be saved from the previous pass.");
7794 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7795
7798
7800 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7801
7802 if (EPI.SCEVSafetyCheck)
7804 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7805 if (EPI.MemSafetyCheck)
7807 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7808
7810 VecEpilogueIterationCountCheck,
7811 VecEpilogueIterationCountCheck->getSinglePredecessor());
7812
7815 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7816 // If there is an epilogue which must run, there's no edge from the
7817 // middle block to exit blocks and thus no need to update the immediate
7818 // dominator of the exit blocks.
7821
7822 // Keep track of bypass blocks, as they feed start values to the induction and
7823 // reduction phis in the scalar loop preheader.
7824 if (EPI.SCEVSafetyCheck)
7826 if (EPI.MemSafetyCheck)
7829
7830 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7831 // reductions which merge control-flow from the latch block and the middle
7832 // block. Update the incoming values here and move the Phi into the preheader.
7833 SmallVector<PHINode *, 4> PhisInBlock;
7834 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7835 PhisInBlock.push_back(&Phi);
7836
7837 for (PHINode *Phi : PhisInBlock) {
7838 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7839 Phi->replaceIncomingBlockWith(
7840 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7841 VecEpilogueIterationCountCheck);
7842
7843 // If the phi doesn't have an incoming value from the
7844 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7845 // value and also those from other check blocks. This is needed for
7846 // reduction phis only.
7847 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7848 return EPI.EpilogueIterationCountCheck == IncB;
7849 }))
7850 continue;
7851 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7852 if (EPI.SCEVSafetyCheck)
7853 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7854 if (EPI.MemSafetyCheck)
7855 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7856 }
7857
7858 // Generate a resume induction for the vector epilogue and put it in the
7859 // vector epilogue preheader
7860 Type *IdxTy = Legal->getWidestInductionType();
7861 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7863 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7864 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7866
7867 // Generate induction resume values. These variables save the new starting
7868 // indexes for the scalar loop. They are used to test if there are any tail
7869 // iterations left once the vector loop has completed.
7870 // Note that when the vectorized epilogue is skipped due to the iteration count
7871 // check, the resume value for the induction variable comes from
7872 // the trip count of the main vector loop, hence passing the AdditionalBypass
7873 // argument.
7874 createInductionResumeValues(ExpandedSCEVs,
7875 {VecEpilogueIterationCountCheck,
7876 EPI.VectorTripCount} /* AdditionalBypass */);
7877
7878 return {completeLoopSkeleton(), EPResumeVal};
7879}
7880
7881BasicBlock *
7883 BasicBlock *Bypass, BasicBlock *Insert) {
7884
7886 "Expected trip count to have been safed in the first pass.");
7887 assert(
7888 (!isa<Instruction>(EPI.TripCount) ||
7889 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7890 "saved trip count does not dominate insertion point.");
7891 Value *TC = EPI.TripCount;
7892 IRBuilder<> Builder(Insert->getTerminator());
7893 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7894
7895 // Generate code to check if the loop's trip count is less than VF * UF of the
7896 // vector epilogue loop.
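// Worked example (hypothetical counts): if the original trip count is 37 and
// the main loop ran with VF * UF = 8, the vector trip count is 32 and
// n.vec.remaining is 5. With an epilogue VF * UF of 4, the check below is
// effectively '5 < 4' (or '<=' when a scalar epilogue is required), which is
// false, so the epilogue executes 4 more iterations and one iteration is left
// for the scalar loop.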
7897 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7898 ? ICmpInst::ICMP_ULE
7899 : ICmpInst::ICMP_ULT;
7900 
7901 Value *CheckMinIters =
7902 Builder.CreateICmp(P, Count,
7903 createStepForVF(Builder, Count->getType(),
7904 EPI.EpilogueVF, EPI.EpilogueUF),
7905 "min.epilog.iters.check");
7906
7907 BranchInst &BI =
7908 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7910 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7911 unsigned EpilogueLoopStep =
7912 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7913 // We assume the remaining `Count` is equally distributed in
7914 // [0, MainLoopStep)
7915 // So the probability for `Count < EpilogueLoopStep` should be
7916 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
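// E.g. (hypothetical steps): MainLoopStep = 8 and EpilogueLoopStep = 4 give
// EstimatedSkipCount = 4 and weights {4, 4}, i.e. an estimated 50% chance
// that the remaining count is too small for the vectorized epilogue.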
7917 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7918 const uint32_t Weights[] = {EstimatedSkipCount,
7919 MainLoopStep - EstimatedSkipCount};
7920 setBranchWeights(BI, Weights);
7921 }
7922 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7923
7924 LoopBypassBlocks.push_back(Insert);
7925 return Insert;
7926}
7927
7929 LLVM_DEBUG({
7930 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7931 << "Epilogue Loop VF:" << EPI.EpilogueVF
7932 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7933 });
7934}
7935
7938 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7939 });
7940}
7941
7943 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7944 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7945 bool PredicateAtRangeStart = Predicate(Range.Start);
7946
7947 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7948 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7949 Range.End = TmpVF;
7950 break;
7951 }
7952
7953 return PredicateAtRangeStart;
7954}
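// Usage sketch for getDecisionAndClampRange above (hypothetical predicate):
// for Range = [4, 32) and a predicate that holds for VF=4 and VF=8 but not
// for VF=16, the loop clamps Range.End to 16 and the function returns true,
// so the caller's decision is valid exactly for the remaining sub-range
// [4, 16).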
7955
7956/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7957/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7958/// of VF's starting at a given VF and extending it as much as possible. Each
7959/// vectorization decision can potentially shorten this sub-range during
7960/// buildVPlan().
7962 ElementCount MaxVF) {
7963 auto MaxVFTimes2 = MaxVF * 2;
7964 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7965 VFRange SubRange = {VF, MaxVFTimes2};
7966 VPlans.push_back(buildVPlan(SubRange));
7967 VF = SubRange.End;
7968 }
7969}
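// E.g. for buildVPlans above (hypothetical bounds): MinVF=2 and MaxVF=8 start
// with the sub-range [2, 16); if no decision in buildVPlan() splits it, a
// single VPlan covering VF 2, 4 and 8 is built and the loop terminates,
// otherwise the next iteration resumes at the clamped SubRange.End.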
7970
7971iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7973 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7974 if (auto *I = dyn_cast<Instruction>(Op)) {
7975 if (auto *R = Ingredient2Recipe.lookup(I))
7976 return R->getVPSingleValue();
7977 }
7978 return Plan.getOrAddLiveIn(Op);
7979 };
7980 return map_range(Operands, Fn);
7981}
7982
7984 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7985
7986 // Look for cached value.
7987 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7988 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7989 if (ECEntryIt != EdgeMaskCache.end())
7990 return ECEntryIt->second;
7991
7992 VPValue *SrcMask = getBlockInMask(Src);
7993
7994 // The terminator has to be a branch inst!
7995 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7996 assert(BI && "Unexpected terminator found");
7997
7998 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7999 return EdgeMaskCache[Edge] = SrcMask;
8000
8001 // If source is an exiting block, we know the exit edge is dynamically dead
8002 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8003 // adding uses of an otherwise potentially dead instruction.
8004 if (OrigLoop->isLoopExiting(Src))
8005 return EdgeMaskCache[Edge] = SrcMask;
8006
8007 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
8008 assert(EdgeMask && "No Edge Mask found for condition");
8009
8010 if (BI->getSuccessor(0) != Dst)
8011 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8012
8013 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8014 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8015 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8016 // The select version does not introduce new UB if SrcMask is false and
8017 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
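// For illustration, the emitted mask is effectively
//   %edge.mask = select i1 %src.mask, i1 %cond, i1 false
// rather than 'and i1 %src.mask, %cond', so a poison %cond on a dead edge
// cannot leak into the combined mask.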
8018 VPValue *False = Plan.getOrAddLiveIn(
8020 EdgeMask =
8021 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8022 }
8023
8024 return EdgeMaskCache[Edge] = EdgeMask;
8025}
8026
8028 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8029
8030 // Look for cached value.
8031 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8032 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8033 assert(ECEntryIt != EdgeMaskCache.end() &&
8034 "looking up mask for edge which has not been created");
8035 return ECEntryIt->second;
8036}
8037
8039 BasicBlock *Header = OrigLoop->getHeader();
8040
8041 // When not folding the tail, use nullptr to model all-true mask.
8042 if (!CM.foldTailByMasking()) {
8043 BlockMaskCache[Header] = nullptr;
8044 return;
8045 }
8046
8047 // Introduce the early-exit compare IV <= BTC to form header block mask.
8048 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8049 // constructing the desired canonical IV in the header block as its first
8050 // non-phi instructions.
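// Worked example (hypothetical trip count): for a loop with trip count 10,
// BTC = 9. With VF = 4 the widened IV for the third vector iteration holds
// <8, 9, 10, 11>, so the header mask 'icmp ule IV, 9' is <1, 1, 0, 0> and the
// two extra lanes are disabled.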
8051
8052 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8053 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8054 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8055 HeaderVPBB->insert(IV, NewInsertionPoint);
8056
8057 VPBuilder::InsertPointGuard Guard(Builder);
8058 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8059 VPValue *BlockMask = nullptr;
8060 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8061 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8062 BlockMaskCache[Header] = BlockMask;
8063}
8064
8066 // Return the cached value.
8067 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8068 assert(BCEntryIt != BlockMaskCache.end() &&
8069 "Trying to access mask for block without one.");
8070 return BCEntryIt->second;
8071}
8072
8074 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8075 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8076 assert(OrigLoop->getHeader() != BB &&
8077 "Loop header must have cached block mask");
8078
8079 // All-one mask is modelled as no-mask following the convention for masked
8080 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8081 VPValue *BlockMask = nullptr;
8082 // This is the block mask. We OR all incoming edges.
8083 for (auto *Predecessor : predecessors(BB)) {
8084 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8085 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8086 BlockMaskCache[BB] = EdgeMask;
8087 return;
8088 }
8089
8090 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8091 BlockMask = EdgeMask;
8092 continue;
8093 }
8094
8095 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8096 }
8097
8098 BlockMaskCache[BB] = BlockMask;
8099}
8100
8102VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8103 VFRange &Range) {
8104 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8105 "Must be called with either a load or store");
8106
8107 auto willWiden = [&](ElementCount VF) -> bool {
8109 CM.getWideningDecision(I, VF);
8111 "CM decision should be taken at this point.");
8113 return true;
8114 if (CM.isScalarAfterVectorization(I, VF) ||
8115 CM.isProfitableToScalarize(I, VF))
8116 return false;
8118 };
8119
8121 return nullptr;
8122
8123 VPValue *Mask = nullptr;
8124 if (Legal->isMaskRequired(I))
8125 Mask = getBlockInMask(I->getParent());
8126
8127 // Determine if the pointer operand of the access is either consecutive or
8128 // reverse consecutive.
8130 CM.getWideningDecision(I, Range.Start);
8132 bool Consecutive =
8134
8135 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8136 if (Consecutive) {
8137 auto *GEP = dyn_cast<GetElementPtrInst>(
8138 Ptr->getUnderlyingValue()->stripPointerCasts());
8139 auto *VectorPtr = new VPVectorPointerRecipe(
8140 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8141 I->getDebugLoc());
8142 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8143 Ptr = VectorPtr;
8144 }
8145 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8146 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8147 I->getDebugLoc());
8148
8149 StoreInst *Store = cast<StoreInst>(I);
8150 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8151 Reverse, I->getDebugLoc());
8152}
8153
8154 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8155/// insert a recipe to expand the step for the induction recipe.
8158 VPValue *Start, const InductionDescriptor &IndDesc,
8159 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8160 VFRange &Range) {
8161 assert(IndDesc.getStartValue() ==
8162 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8163 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8164 "step must be loop invariant");
8165
8166 VPValue *Step =
8167 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8168 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8169 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8170 }
8171 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8172 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8173}
8174
8175VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8177
8178 // Check if this is an integer or fp induction. If so, build the recipe that
8179 // produces its scalar and vector values.
8180 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8181 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8182 *PSE.getSE(), *OrigLoop, Range);
8183
8184 // Check if this is pointer induction. If so, build the recipe for it.
8185 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8186 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8187 *PSE.getSE());
8189 Phi, Operands[0], Step, *II,
8191 [&](ElementCount VF) {
8192 return CM.isScalarAfterVectorization(Phi, VF);
8193 },
8194 Range));
8195 }
8196 return nullptr;
8197}
8198
8199VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8201 // Optimize the special case where the source is a constant integer
8202 // induction variable. Notice that we can only optimize the 'trunc' case
8203 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8204 // (c) other casts depend on pointer size.
8205
8206 // Determine whether \p K is a truncation based on an induction variable that
8207 // can be optimized.
8208 auto isOptimizableIVTruncate =
8209 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8210 return [=](ElementCount VF) -> bool {
8211 return CM.isOptimizableIVTruncate(K, VF);
8212 };
8213 };
8214
8216 isOptimizableIVTruncate(I), Range)) {
8217
8218 auto *Phi = cast<PHINode>(I->getOperand(0));
8219 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8220 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8221 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8222 *OrigLoop, Range);
8223 }
8224 return nullptr;
8225}
8226
8227VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8229 unsigned NumIncoming = Phi->getNumIncomingValues();
8230
8231 // We know that all PHIs in non-header blocks are converted into selects, so
8232 // we don't have to worry about the insertion order and we can just use the
8233 // builder. At this point we generate the predication tree. There may be
8234 // duplications since this is a simple recursive scan, but future
8235 // optimizations will clean it up.
8236 // TODO: At the moment the first mask is always skipped, but it would be
8237 // better to skip the most expensive mask.
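// Rough illustration (hypothetical PHI): '%p = phi [%x, %if.then], [%y,
// %if.else]' in a merge block becomes a BLEND of %x and %y where %y is
// guarded by the mask of the %if.else -> merge edge; the first incoming
// value carries no mask and acts as the default.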
8238 SmallVector<VPValue *, 2> OperandsWithMask;
8239
8240 for (unsigned In = 0; In < NumIncoming; In++) {
8241 OperandsWithMask.push_back(Operands[In]);
8242 VPValue *EdgeMask =
8243 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8244 if (!EdgeMask) {
8245 assert(In == 0 && "Both null and non-null edge masks found");
8247 "Distinct incoming values with one having a full mask");
8248 break;
8249 }
8250 if (In == 0)
8251 continue;
8252 OperandsWithMask.push_back(EdgeMask);
8253 }
8254 return new VPBlendRecipe(Phi, OperandsWithMask);
8255}
8256
8257VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8259 VFRange &Range) {
8261 [this, CI](ElementCount VF) {
8262 return CM.isScalarWithPredication(CI, VF);
8263 },
8264 Range);
8265
8266 if (IsPredicated)
8267 return nullptr;
8268
8270 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8271 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8272 ID == Intrinsic::pseudoprobe ||
8273 ID == Intrinsic::experimental_noalias_scope_decl))
8274 return nullptr;
8275
8276 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8277 Ops.push_back(Operands.back());
8278
8279 // Is it beneficial to perform intrinsic call compared to lib call?
8280 bool ShouldUseVectorIntrinsic =
8282 [&](ElementCount VF) -> bool {
8283 return CM.getCallWideningDecision(CI, VF).Kind ==
8285 },
8286 Range);
8287 if (ShouldUseVectorIntrinsic)
8288 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
8289 CI->getDebugLoc());
8290
8291 Function *Variant = nullptr;
8292 std::optional<unsigned> MaskPos;
8293 // Is it better to call a vectorized version of the function than to
8294 // scalarize the call?
8295 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8296 [&](ElementCount VF) -> bool {
8297 // The following case may be scalarized depending on the VF.
8298 // The flag shows whether we can use a usual Call for vectorized
8299 // version of the instruction.
8300
8301 // If we've found a variant at a previous VF, then stop looking. A
8302 // vectorized variant of a function expects input in a certain shape
8303 // -- basically the number of input registers, the number of lanes
8304 // per register, and whether there's a mask required.
8305 // We store a pointer to the variant in the VPWidenCallRecipe, so
8306 // once we have an appropriate variant it's only valid for that VF.
8307 // This will force a different vplan to be generated for each VF that
8308 // finds a valid variant.
8309 if (Variant)
8310 return false;
8312 CM.getCallWideningDecision(CI, VF);
8314 Variant = Decision.Variant;
8315 MaskPos = Decision.MaskPos;
8316 return true;
8317 }
8318
8319 return false;
8320 },
8321 Range);
8322 if (ShouldUseVectorCall) {
8323 if (MaskPos.has_value()) {
8324 // We have 2 cases that would require a mask:
8325 // 1) The block needs to be predicated, either due to a conditional
8326 // in the scalar loop or use of an active lane mask with
8327 // tail-folding, and we use the appropriate mask for the block.
8328 // 2) No mask is required for the block, but the only available
8329 // vector variant at this VF requires a mask, so we synthesize an
8330 // all-true mask.
8331 VPValue *Mask = nullptr;
8332 if (Legal->isMaskRequired(CI))
8333 Mask = getBlockInMask(CI->getParent());
8334 else
8336 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8337
8338 Ops.insert(Ops.begin() + *MaskPos, Mask);
8339 }
8340
8341 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
8343 Variant);
8344 }
8345
8346 return nullptr;
8347}
8348
8349bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8350 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8351 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8352 // Instruction should be widened, unless it is scalar after vectorization,
8353 // scalarization is profitable or it is predicated.
8354 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8355 return CM.isScalarAfterVectorization(I, VF) ||
8356 CM.isProfitableToScalarize(I, VF) ||
8357 CM.isScalarWithPredication(I, VF);
8358 };
8360 Range);
8361}
8362
8363VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8365 VPBasicBlock *VPBB) {
8366 switch (I->getOpcode()) {
8367 default:
8368 return nullptr;
8369 case Instruction::SDiv:
8370 case Instruction::UDiv:
8371 case Instruction::SRem:
8372 case Instruction::URem: {
8373 // If not provably safe, use a select to form a safe divisor before widening the
8374 // div/rem operation itself. Otherwise fall through to general handling below.
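// Sketch of the effect (hypothetical operands): a predicated 'udiv %a, %b'
// becomes
//   %safe.b = select <block mask>, %b, <splat 1>
//   %res    = udiv %a, %safe.b
// so masked-off lanes divide by 1 and cannot trap.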
8375 if (CM.isPredicatedInst(I)) {
8376 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8377 VPValue *Mask = getBlockInMask(I->getParent());
8378 VPValue *One =
8379 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8380 auto *SafeRHS =
8381 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8382 I->getDebugLoc());
8383 VPBB->appendRecipe(SafeRHS);
8384 Ops[1] = SafeRHS;
8385 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8386 }
8387 [[fallthrough]];
8388 }
8389 case Instruction::Add:
8390 case Instruction::And:
8391 case Instruction::AShr:
8392 case Instruction::FAdd:
8393 case Instruction::FCmp:
8394 case Instruction::FDiv:
8395 case Instruction::FMul:
8396 case Instruction::FNeg:
8397 case Instruction::FRem:
8398 case Instruction::FSub:
8399 case Instruction::ICmp:
8400 case Instruction::LShr:
8401 case Instruction::Mul:
8402 case Instruction::Or:
8403 case Instruction::Select:
8404 case Instruction::Shl:
8405 case Instruction::Sub:
8406 case Instruction::Xor:
8407 case Instruction::Freeze:
8408 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8409 };
8410}
8411
8413 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8414 for (VPHeaderPHIRecipe *R : PhisToFix) {
8415 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8416 VPRecipeBase *IncR =
8417 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8418 R->addOperand(IncR->getVPSingleValue());
8419 }
8420}
8421
8423 VFRange &Range) {
8425 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8426 Range);
8427
8428 bool IsPredicated = CM.isPredicatedInst(I);
8429
8430 // Even if the instruction is not marked as uniform, there are certain
8431 // intrinsic calls that can be effectively treated as such, so we check for
8432 // them here. Conservatively, we only do this for scalable vectors, since
8433 // for fixed-width VFs we can always fall back on full scalarization.
8434 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8435 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8436 case Intrinsic::assume:
8437 case Intrinsic::lifetime_start:
8438 case Intrinsic::lifetime_end:
8439 // For scalable vectors if one of the operands is variant then we still
8440 // want to mark as uniform, which will generate one instruction for just
8441 // the first lane of the vector. We can't scalarize the call in the same
8442 // way as for fixed-width vectors because we don't know how many lanes
8443 // there are.
8444 //
8445 // The reasons for doing it this way for scalable vectors are:
8446 // 1. For the assume intrinsic generating the instruction for the first
8447 // lane is still better than not generating any at all. For
8448 // example, the input may be a splat across all lanes.
8449 // 2. For the lifetime start/end intrinsics the pointer operand only
8450 // does anything useful when the input comes from a stack object,
8451 // which suggests it should always be uniform. For non-stack objects
8452 // the effect is to poison the object, which still allows us to
8453 // remove the call.
8454 IsUniform = true;
8455 break;
8456 default:
8457 break;
8458 }
8459 }
8460 VPValue *BlockInMask = nullptr;
8461 if (!IsPredicated) {
8462 // Finalize the recipe for Instr, first if it is not predicated.
8463 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8464 } else {
8465 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8466 // Instructions marked for predication are replicated and a mask operand is
8467 // added initially. Masked replicate recipes will later be placed under an
8468 // if-then construct to prevent side-effects. Generate recipes to compute
8469 // the block mask for this region.
8470 BlockInMask = getBlockInMask(I->getParent());
8471 }
8472
8473 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8474 IsUniform, BlockInMask);
8475 return Recipe;
8476}
8477
8481 VFRange &Range, VPBasicBlock *VPBB) {
8482 // First, check for specific widening recipes that deal with inductions, Phi
8483 // nodes, calls and memory operations.
8484 VPRecipeBase *Recipe;
8485 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8486 if (Phi->getParent() != OrigLoop->getHeader())
8487 return tryToBlend(Phi, Operands);
8488
8489 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8490 return Recipe;
8491
8492 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8493 assert((Legal->isReductionVariable(Phi) ||
8494 Legal->isFixedOrderRecurrence(Phi)) &&
8495 "can only widen reductions and fixed-order recurrences here");
8496 VPValue *StartV = Operands[0];
8497 if (Legal->isReductionVariable(Phi)) {
8498 const RecurrenceDescriptor &RdxDesc =
8499 Legal->getReductionVars().find(Phi)->second;
8500 assert(RdxDesc.getRecurrenceStartValue() ==
8501 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8502 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8503 CM.isInLoopReduction(Phi),
8504 CM.useOrderedReductions(RdxDesc));
8505 } else {
8506 // TODO: Currently fixed-order recurrences are modeled as chains of
8507 // first-order recurrences. If there are no users of the intermediate
8508 // recurrences in the chain, the fixed order recurrence should be modeled
8509 // directly, enabling more efficient codegen.
8510 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8511 }
8512
8513 PhisToFix.push_back(PhiRecipe);
8514 return PhiRecipe;
8515 }
8516
8517 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8518 cast<TruncInst>(Instr), Operands, Range)))
8519 return Recipe;
8520
8521 // All widen recipes below deal only with VF > 1.
8523 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8524 return nullptr;
8525
8526 if (auto *CI = dyn_cast<CallInst>(Instr))
8527 return tryToWidenCall(CI, Operands, Range);
8528
8529 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8530 return tryToWidenMemory(Instr, Operands, Range);
8531
8532 if (!shouldWiden(Instr, Range))
8533 return nullptr;
8534
8535 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8536 return new VPWidenGEPRecipe(GEP,
8537 make_range(Operands.begin(), Operands.end()));
8538
8539 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8540 return new VPWidenSelectRecipe(
8541 *SI, make_range(Operands.begin(), Operands.end()));
8542 }
8543
8544 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8545 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8546 *CI);
8547 }
8548
8549 return tryToWiden(Instr, Operands, VPBB);
8550}
8551
8552void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8553 ElementCount MaxVF) {
8554 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8555
8556 auto MaxVFTimes2 = MaxVF * 2;
8557 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8558 VFRange SubRange = {VF, MaxVFTimes2};
8559 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8560 // Now optimize the initial VPlan.
8561 if (!Plan->hasVF(ElementCount::getFixed(1)))
8563 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8564 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8565 // TODO: try to put it close to addActiveLaneMask().
8566 if (CM.foldTailWithEVL())
8568 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8569 VPlans.push_back(std::move(Plan));
8570 }
8571 VF = SubRange.End;
8572 }
8573}
8574
8575// Add the necessary canonical IV and branch recipes required to control the
8576// loop.
8577static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8578 DebugLoc DL) {
8579 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8580 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8581
8582 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8583 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8584 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8585 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8586 Header->insert(CanonicalIVPHI, Header->begin());
8587
8588 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8589 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8590 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8591 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8592 "index.next");
8593 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8594
8595 // Add the BranchOnCount VPInstruction to the latch.
8597 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8598}
8599
8600// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8601// original exit block.
8602static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8603 VPRecipeBuilder &Builder, VPlan &Plan) {
8604 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8605 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8606 // Only handle single-exit loops with unique exit blocks for now.
8607 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8608 return;
8609
8610 // Introduce VPUsers modeling the exit values.
8611 for (PHINode &ExitPhi : ExitBB->phis()) {
8612 Value *IncomingValue =
8613 ExitPhi.getIncomingValueForBlock(ExitingBB);
8614 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8615 Plan.addLiveOut(&ExitPhi, V);
8616 }
8617}
8618
8620LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8621
8623
8624 // ---------------------------------------------------------------------------
8625 // Build initial VPlan: Scan the body of the loop in a topological order to
8626 // visit each basic block after having visited its predecessor basic blocks.
8627 // ---------------------------------------------------------------------------
8628
8629 // Create initial VPlan skeleton, having a basic block for the pre-header
8630 // which contains SCEV expansions that need to happen before the CFG is
8631 // modified; a basic block for the vector pre-header, followed by a region for
8632 // the vector loop, followed by the middle basic block. The skeleton vector
8633 // loop region contains a header and latch basic blocks.
8635 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8636 *PSE.getSE());
8637 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8638 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8639 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8640 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8641 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8642
8643 // Don't use getDecisionAndClampRange here, because we don't know the UF
8644 // so this function is better to be conservative, rather than to split
8645 // it up into different VPlans.
8646 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8647 bool IVUpdateMayOverflow = false;
8648 for (ElementCount VF : Range)
8649 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8650
8652 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8653 // When not folding the tail, we know that the induction increment will not
8654 // overflow.
8655 bool HasNUW = Style == TailFoldingStyle::None;
8656 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8657
8658 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8659
8660 // ---------------------------------------------------------------------------
8661 // Pre-construction: record ingredients whose recipes we'll need to further
8662 // process after constructing the initial VPlan.
8663 // ---------------------------------------------------------------------------
8664
8665 // For each interleave group which is relevant for this (possibly trimmed)
8666 // Range, add it to the set of groups to be later applied to the VPlan and add
8667 // placeholders for its members' Recipes which we'll be replacing with a
8668 // single VPInterleaveRecipe.
8670 auto applyIG = [IG, this](ElementCount VF) -> bool {
8671 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8672 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8674 // For scalable vectors, the only interleave factor currently supported
8675 // is 2 since we require the (de)interleave2 intrinsics instead of
8676 // shufflevectors.
8677 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8678 "Unsupported interleave factor for scalable vectors");
8679 return Result;
8680 };
8681 if (!getDecisionAndClampRange(applyIG, Range))
8682 continue;
8683 InterleaveGroups.insert(IG);
8684 };
8685
8686 // ---------------------------------------------------------------------------
8687 // Construct recipes for the instructions in the loop
8688 // ---------------------------------------------------------------------------
8689
8690 // Scan the body of the loop in a topological order to visit each basic block
8691 // after having visited its predecessor basic blocks.
8692 LoopBlocksDFS DFS(OrigLoop);
8693 DFS.perform(LI);
8694
8695 VPBasicBlock *VPBB = HeaderVPBB;
8696 BasicBlock *HeaderBB = OrigLoop->getHeader();
8697 bool NeedsMasks =
8698 CM.foldTailByMasking() ||
8699 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8700 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8701 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8702 });
8703 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8704 // Relevant instructions from basic block BB will be grouped into VPRecipe
8705 // ingredients and fill a new VPBasicBlock.
8706 if (VPBB != HeaderVPBB)
8707 VPBB->setName(BB->getName());
8708 Builder.setInsertPoint(VPBB);
8709
8710 if (VPBB == HeaderVPBB)
8711 RecipeBuilder.createHeaderMask();
8712 else if (NeedsMasks)
8713 RecipeBuilder.createBlockInMask(BB);
8714
8715 // Introduce each ingredient into VPlan.
8716 // TODO: Model and preserve debug intrinsics in VPlan.
8717 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8718 Instruction *Instr = &I;
8720 auto *Phi = dyn_cast<PHINode>(Instr);
8721 if (Phi && Phi->getParent() == HeaderBB) {
8722 Operands.push_back(Plan->getOrAddLiveIn(
8723 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8724 } else {
8725 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8726 Operands = {OpRange.begin(), OpRange.end()};
8727 }
8728
8729 // Invariant stores inside the loop will be deleted, and a single store
8730 // with the final reduction value will be added to the exit block.
8731 StoreInst *SI;
8732 if ((SI = dyn_cast<StoreInst>(&I)) &&
8733 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8734 continue;
8735
8736 VPRecipeBase *Recipe =
8737 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8738 if (!Recipe)
8739 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8740
8741 RecipeBuilder.setRecipe(Instr, Recipe);
8742 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8743 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8744 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8745 // recipes and need to be moved to the phi section of HeaderVPBB:
8746 // * tail-folding (non-phi recipes computing the header mask are
8747 // introduced earlier than regular header phi recipes, and should appear
8748 // after them)
8749 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8750
8751 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8752 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8753 "unexpected recipe needs moving");
8754 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8755 } else
8756 VPBB->appendRecipe(Recipe);
8757 }
8758
8760 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8761 }
8762
8763 // After here, VPBB should not be used.
8764 VPBB = nullptr;
8765
8766 if (CM.requiresScalarEpilogue(Range)) {
8767 // No edge from the middle block to the unique exit block has been inserted
8768 // and there is nothing to fix from the vector loop; phis should only have
8769 // incoming values from the scalar loop.
8770 } else
8771 addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
8772
8773 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8774 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8775 "entry block must be set to a VPRegionBlock having a non-empty entry "
8776 "VPBasicBlock");
8777 RecipeBuilder.fixHeaderPhis();
8778
8779 // ---------------------------------------------------------------------------
8780 // Transform initial VPlan: Apply previously taken decisions, in order, to
8781 // bring the VPlan to its final state.
8782 // ---------------------------------------------------------------------------
8783
8784 // Adjust the recipes for any inloop reductions.
8785 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8786
8787 // Interleave memory: for each Interleave Group we marked earlier as relevant
8788 // for this VPlan, replace the Recipes widening its memory instructions with a
8789 // single VPInterleaveRecipe at its insertion point.
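// E.g. (hypothetical group): two loads of A[2*i] and A[2*i+1] forming a group
// with factor 2 are replaced by one VPInterleaveRecipe that performs a single
// wide load and then de-interleaves the even and odd lanes for the two
// members.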
8790 for (const auto *IG : InterleaveGroups) {
8791 auto *Recipe =
8792 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8793 SmallVector<VPValue *, 4> StoredValues;
8794 for (unsigned i = 0; i < IG->getFactor(); ++i)
8795 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8796 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8797 StoredValues.push_back(StoreR->getStoredValue());
8798 }
8799
8800 bool NeedsMaskForGaps =
8801 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8802 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8803 Recipe->getMask(), NeedsMaskForGaps);
8804 VPIG->insertBefore(Recipe);
8805 unsigned J = 0;
8806 for (unsigned i = 0; i < IG->getFactor(); ++i)
8807 if (Instruction *Member = IG->getMember(i)) {
8808 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8809 if (!Member->getType()->isVoidTy()) {
8810 VPValue *OriginalV = MemberR->getVPSingleValue();
8811 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8812 J++;
8813 }
8814 MemberR->eraseFromParent();
8815 }
8816 }
8817
8818 for (ElementCount VF : Range)
8819 Plan->addVF(VF);
8820 Plan->setName("Initial VPlan");
8821
8822 // Replace VPValues for known constant strides guaranteed by predicate scalar
8823 // evolution.
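// E.g. (hypothetical stride): if predicated SCEV has versioned the loop on
// '%stride == 1', the live-in VPValue for %stride is replaced by the constant
// 1 below, and a sext/zext of %stride used inside the loop is replaced by the
// same constant in the extended type.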
8824 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8825 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8826 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8827 // Only handle constant strides for now.
8828 if (!ScevStride)
8829 continue;
8830
8831 auto *CI = Plan->getOrAddLiveIn(
8832 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
8833 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
8834 StrideVPV->replaceAllUsesWith(CI);
8835
8836 // The versioned value may not be used in the loop directly but through a
8837 // sext/zext. Add new live-ins in those cases.
8838 for (Value *U : StrideV->users()) {
8839 if (!isa<SExtInst, ZExtInst>(U))
8840 continue;
8841 VPValue *StrideVPV = Plan->getLiveIn(U);
8842 if (!StrideVPV)
8843 continue;
8844 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(
8845 U->getType(), ScevStride->getAPInt().getSExtValue()));
8846 StrideVPV->replaceAllUsesWith(CI);
8847 }
8848 }
8849
8851 return Legal->blockNeedsPredication(BB);
8852 });
8853
8854 // Sink users of fixed-order recurrence past the recipe defining the previous
8855 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8857 return nullptr;
8858
8859 if (useActiveLaneMask(Style)) {
8860 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8861 // TailFoldingStyle is visible there.
8862 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8863 bool WithoutRuntimeCheck =
8865 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8866 WithoutRuntimeCheck);
8867 }
8868 return Plan;
8869}
8870
8871VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8872 // Outer loop handling: They may require CFG and instruction level
8873 // transformations before even evaluating whether vectorization is profitable.
8874 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8875 // the vectorization pipeline.
8876 assert(!OrigLoop->isInnermost());
8877 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8878
8879 // Create new empty VPlan
8880 auto Plan = VPlan::createInitialVPlan(
8881 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8882 *PSE.getSE());
8883
8884 // Build hierarchical CFG
8885 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8886 HCFGBuilder.buildHierarchicalCFG();
8887
8888 for (ElementCount VF : Range)
8889 Plan->addVF(VF);
8890
8892 Plan,
8893 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8894 *PSE.getSE(), *TLI);
8895
8896 // Remove the existing terminator of the exiting block of the top-most region.
8897 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8898 auto *Term =
8899 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8900 Term->eraseFromParent();
8901
8902 // Tail folding is not supported for outer loops, so the induction increment
8903 // is guaranteed to not wrap.
8904 bool HasNUW = true;
8905 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8906 DebugLoc());
8907 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8908 return Plan;
8909}
8910
8911// Adjust the recipes for reductions. For in-loop reductions the chain of
8912 // instructions leading from the loop exit instr to the phi needs to be converted
8913// to reductions, with one operand being vector and the other being the scalar
8914// reduction chain. For other reductions, a select is introduced between the phi
8915// and live-out recipes when folding the tail.
8916//
8917// A ComputeReductionResult recipe is added to the middle block, also for
8918// in-loop reductions which compute their result in-loop, because generating
8919// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8920//
8921// Adjust AnyOf reductions; replace the reduction phi for the selected value
8922// with a boolean reduction phi node to check if the condition is true in any
8923// iteration. The final value is selected by the final ComputeReductionResult.
8924void LoopVectorizationPlanner::adjustRecipesForReductions(
8925 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8926 ElementCount MinVF) {
8927 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8928 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8929 // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8930 // sunk outside of the loop keep the same order as they had in the
8931 // original loop.
8932 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8933 for (VPRecipeBase &R : Header->phis()) {
8934 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8935 ReductionPHIList.emplace_back(ReductionPhi);
8936 }
8937 bool HasIntermediateStore = false;
8938 stable_sort(ReductionPHIList,
8939 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8940 const VPReductionPHIRecipe *R2) {
8941 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8942 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8943 HasIntermediateStore |= IS1 || IS2;
8944
8945 // If neither of the recipes has an intermediate store, keep the
8946 // order the same.
8947 if (!IS1 && !IS2)
8948 return false;
8949
8950 // If only one of the recipes has an intermediate store, then
8951 // move it towards the beginning of the list.
8952 if (IS1 && !IS2)
8953 return true;
8954
8955 if (!IS1 && IS2)
8956 return false;
8957
8958 // If both recipes have an intermediate store, then the recipe
8959 // with the later store should be processed earlier. So it
8960 // should go to the beginning of the list.
8961 return DT->dominates(IS2, IS1);
8962 });
8963
8964 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8965 for (VPRecipeBase *R : ReductionPHIList)
8966 R->moveBefore(*Header, Header->getFirstNonPhi());
8967
8968 for (VPRecipeBase &R : Header->phis()) {
8969 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8970 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8971 continue;
8972
8973 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8974 RecurKind Kind = RdxDesc.getRecurrenceKind();
8976 "AnyOf reductions are not allowed for in-loop reductions");
8977
8978 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8980 Worklist.insert(PhiR);
8981 for (unsigned I = 0; I != Worklist.size(); ++I) {
8982 VPSingleDefRecipe *Cur = Worklist[I];
8983 for (VPUser *U : Cur->users()) {
8984 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8985 if (!UserRecipe) {
8986 assert(isa<VPLiveOut>(U) &&
8987 "U must either be a VPSingleDef or VPLiveOut");
8988 continue;
8989 }
8990 Worklist.insert(UserRecipe);
8991 }
8992 }
8993
8994 // Visit operation "Links" along the reduction chain top-down starting from
8995 // the phi until LoopExitValue. We keep track of the previous item
8996 // (PreviousLink) to tell which of the two operands of a Link will remain
8997 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8998 // the select instruction. Blend recipes of in-loop reduction phis will
8999 // get folded to their non-phi operand, as the reduction recipe handles the
9000 // condition directly.
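// E.g. (hypothetical chain): for an in-loop FAdd reduction fed by a call to
// llvm.fmuladd(%a, %b, %phi), the code below splits the call into an FMul
// recipe computing %a * %b and a VPReductionRecipe that adds the product to
// the chained scalar value carried by PreviousLink.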
9001 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9002 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9003 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9004
9005 // Index of the first operand which holds a non-mask vector operand.
9006 unsigned IndexOfFirstOperand;
9007 // Recognize a call to the llvm.fmuladd intrinsic.
9008 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9009 VPValue *VecOp;
9010 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9011 if (IsFMulAdd) {
9012 assert(
9014 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9015 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9016 isa<VPWidenCallRecipe>(CurrentLink)) &&
9017 CurrentLink->getOperand(2) == PreviousLink &&
9018 "expected a call where the previous link is the added operand");
9019
9020 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9021 // need to create an fmul recipe (multiplying the first two operands of
9022 // the fmuladd together) to use as the vector operand for the fadd
9023 // reduction.
9024 VPInstruction *FMulRecipe = new VPInstruction(
9025 Instruction::FMul,
9026 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9027 CurrentLinkI->getFastMathFlags());
9028 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9029 VecOp = FMulRecipe;
9030 } else {
9031 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9032 if (PhiR->isInLoop() && Blend) {
9033 assert(Blend->getNumIncomingValues() == 2 &&
9034 "Blend must have 2 incoming values");
9035 if (Blend->getIncomingValue(0) == PhiR)
9036 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9037 else {
9038 assert(Blend->getIncomingValue(1) == PhiR &&
9039 "PhiR must be an operand of the blend");
9040 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9041 }
9042 continue;
9043 }
9044
9046 if (isa<VPWidenRecipe>(CurrentLink)) {
9047 assert(isa<CmpInst>(CurrentLinkI) &&
9048 "need to have the compare of the select");
9049 continue;
9050 }
9051 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9052 "must be a select recipe");
9053 IndexOfFirstOperand = 1;
9054 } else {
9055 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9056 "Expected to replace a VPWidenSC");
9057 IndexOfFirstOperand = 0;
9058 }
9059 // Note that for non-commutable operands (cmp-selects), the semantics of
9060 // the cmp-select are captured in the recurrence kind.
9061 unsigned VecOpId =
9062 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9063 ? IndexOfFirstOperand + 1
9064 : IndexOfFirstOperand;
9065 VecOp = CurrentLink->getOperand(VecOpId);
9066 assert(VecOp != PreviousLink &&
9067 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9068 (VecOpId - IndexOfFirstOperand)) ==
9069 PreviousLink &&
9070 "PreviousLink must be the operand other than VecOp");
9071 }
9072
9073 BasicBlock *BB = CurrentLinkI->getParent();
9074 VPValue *CondOp = nullptr;
9076 CondOp = RecipeBuilder.getBlockInMask(BB);
9077
9078 VPReductionRecipe *RedRecipe =
9079 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
9080 CondOp, CM.useOrderedReductions(RdxDesc));
9081 // Append the recipe to the end of the VPBasicBlock because we need to
9082 // ensure that it comes after all of its inputs, including CondOp.
9083 // Note that this transformation may leave over dead recipes (including
9084 // CurrentLink), which will be cleaned by a later VPlan transform.
9085 LinkVPBB->appendRecipe(RedRecipe);
9086 CurrentLink->replaceAllUsesWith(RedRecipe);
9087 PreviousLink = RedRecipe;
9088 }
9089 }
9090 Builder.setInsertPoint(&*LatchVPBB->begin());
9091 for (VPRecipeBase &R :
9092 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9093 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9094 if (!PhiR)
9095 continue;
9096
9097 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9098 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9099 // with a boolean reduction phi node to check if the condition is true in
9100 // any iteration. The final value is selected by the final
9101 // ComputeReductionResult.
9103 RdxDesc.getRecurrenceKind())) {
9104 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9105 return isa<VPWidenSelectRecipe>(U) ||
9106 (isa<VPReplicateRecipe>(U) &&
9107 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9108 Instruction::Select);
9109 }));
9110 VPValue *Cmp = Select->getOperand(0);
9111 // If the compare is checking the reduction PHI node, adjust it to check
9112 // the start value.
9113 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9114 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9115 if (CmpR->getOperand(I) == PhiR)
9116 CmpR->setOperand(I, PhiR->getStartValue());
9117 }
9118 VPBuilder::InsertPointGuard Guard(Builder);
9119 Builder.setInsertPoint(Select);
9120
9121 // If the true value of the select is the reduction phi, the new value is
9122 // selected if the negated condition is true in any iteration.
9123 if (Select->getOperand(1) == PhiR)
9124 Cmp = Builder.createNot(Cmp);
9125 VPValue *Or = Builder.createOr(PhiR, Cmp);
9126 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9127
9128 // Convert the reduction phi to operate on bools.
9129 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9130 OrigLoop->getHeader()->getContext())));
9131 }
9132
9133 // If tail is folded by masking, introduce selects between the phi
9134 // and the live-out instruction of each reduction, at the beginning of the
9135 // dedicated latch block.
9136 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9137 auto *NewExitingVPV = PhiR->getBackedgeValue();
9138 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9139 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9140 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9141 "reduction recipe must be defined before latch");
9142 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9143 std::optional<FastMathFlags> FMFs =
9144 PhiTy->isFloatingPointTy()
9145 ? std::make_optional(RdxDesc.getFastMathFlags())
9146 : std::nullopt;
9147 NewExitingVPV =
9148 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9149 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9150 return isa<VPInstruction>(&U) &&
9151 cast<VPInstruction>(&U)->getOpcode() ==
9153 });
9156 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9158 PhiR->setOperand(1, NewExitingVPV);
9159 }
9160
9161 // If the vector reduction can be performed in a smaller type, we truncate
9162 // then extend the loop exit value to enable InstCombine to evaluate the
9163 // entire expression in the smaller type.
9164 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9165 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9167 RdxDesc.getRecurrenceKind())) {
9168 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9169 Type *RdxTy = RdxDesc.getRecurrenceType();
9170 auto *Trunc =
9171 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9172 auto *Extnd =
9173 RdxDesc.isSigned()
9174 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9175 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9176
9177 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9178 Extnd->insertAfter(Trunc);
9179 if (PhiR->getOperand(1) == NewExitingVPV)
9180 PhiR->setOperand(1, Extnd->getVPSingleValue());
9181 NewExitingVPV = Extnd;
9182 }
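    // Illustrative example (hypothetical types, not from this file): if PhiTy
    // is a 32-bit integer but the recurrence only needs 8 bits, e.g.
    //   unsigned Acc = 0; for (...) Acc |= Bytes[I];   // Bytes[I] is 8-bit
    // then the exiting value becomes a trunc to i8 followed by a zext back to
    // i32 (sext for signed reductions), so InstCombine can later evaluate the
    // whole reduction chain in the narrow type.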
9183
9184 // We want code in the middle block to appear to execute on the location of
9185 // the scalar loop's latch terminator because: (a) it is all compiler
9186 // generated, (b) these instructions are always executed after evaluating
9187 // the latch conditional branch, and (c) other passes may add new
9188 // predecessors which terminate on this line. This is the easiest way to
9189 // ensure we don't accidentally cause an extra step back into the loop while
9190 // debugging.
9191 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9192
9193 // TODO: At the moment ComputeReductionResult also drives creation of the
9194 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9195 // even for in-loop reductions, until the reduction resume value handling is
9196 // also modeled in VPlan.
9197 auto *FinalReductionResult = new VPInstruction(
9198 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9199 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9200 ->appendRecipe(FinalReductionResult);
9201 OrigExitingVPV->replaceUsesWithIf(
9202 FinalReductionResult,
9203 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9204 }
9205
9207}
9208
9209#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9211 VPSlotTracker &SlotTracker) const {
9212 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9213 IG->getInsertPos()->printAsOperand(O, false);
9214 O << ", ";
9216 VPValue *Mask = getMask();
9217 if (Mask) {
9218 O << ", ";
9219 Mask->printAsOperand(O, SlotTracker);
9220 }
9221
9222 unsigned OpIdx = 0;
9223 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9224 if (!IG->getMember(i))
9225 continue;
9226 if (getNumStoreOperands() > 0) {
9227 O << "\n" << Indent << " store ";
9228 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9229 O << " to index " << i;
9230 } else {
9231 O << "\n" << Indent << " ";
9233 O << " = load from index " << i;
9234 }
9235 ++OpIdx;
9236 }
9237}
9238#endif
9239
9242 "Not a pointer induction according to InductionDescriptor!");
9243 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9244 "Unexpected type.");
9246 "Recipe should have been replaced");
9247
9248 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9249 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9250 Type *PhiType = IndDesc.getStep()->getType();
9251
9252 // Build a pointer phi
9253 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9254 Type *ScStValueType = ScalarStartValue->getType();
9255 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9256 CanonicalIV->getIterator());
9257
9258 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9259 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9260
9261  // A pointer induction, performed using a GEP.
9262 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9263
9264 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9265 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9266 Value *NumUnrolledElems =
9267 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9268 Value *InductionGEP = GetElementPtrInst::Create(
9269 State.Builder.getInt8Ty(), NewPointerPhi,
9270 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9271 InductionLoc);
9272 // Add induction update using an incorrect block temporarily. The phi node
9273 // will be fixed after VPlan execution. Note that at this point the latch
9274 // block cannot be used, as it does not exist yet.
9275 // TODO: Model increment value in VPlan, by turning the recipe into a
9276 // multi-def and a subclass of VPHeaderPHIRecipe.
9277 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9278
9279 // Create UF many actual address geps that use the pointer
9280 // phi as base and a vectorized version of the step value
9281 // (<step*0, ..., step*N>) as offset.
9282 for (unsigned Part = 0; Part < State.UF; ++Part) {
9283 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9284 Value *StartOffsetScalar =
9285 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9286 Value *StartOffset =
9287 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9288 // Create a vector of consecutive numbers from zero to VF.
9289 StartOffset = State.Builder.CreateAdd(
9290 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9291
9292 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9293 "scalar step must be the same across all parts");
9294 Value *GEP = State.Builder.CreateGEP(
9295 State.Builder.getInt8Ty(), NewPointerPhi,
9296 State.Builder.CreateMul(
9297 StartOffset,
9298 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9299 "vector.gep"));
9300 State.set(this, GEP, Part);
9301 }
9302}
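// Illustrative sketch (hypothetical helper, not part of this file) of the
// address pattern materialized above for a pointer induction with byte step
// Step: part Part, lane Lane accesses Base + (Part * VF + Lane) * Step, and
// the pointer phi itself advances by UF * VF * Step per wide iteration.
static char *pointerInductionAddress(char *Base, long Step, unsigned VF,
                                     unsigned Part, unsigned Lane) {
  // Matches StartOffset (splat(VF * Part) + stepvector) scaled by the step.
  return Base + (long)(Part * VF + Lane) * Step;
}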
9303
9305 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9306
9307 // Fast-math-flags propagate from the original induction instruction.
9309 if (FPBinOp)
9310 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9311
9312 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9313 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9314 Value *DerivedIV = emitTransformedIndex(
9315 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9316 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9317 DerivedIV->setName("offset.idx");
9318 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9319
9320 State.set(this, DerivedIV, VPIteration(0, 0));
9321}
9322
9324 assert(!State.Instance && "Interleave group being replicated.");
9325 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9327 NeedsMaskForGaps);
9328}
9329
9332 if (State.Instance) { // Generate a single instance.
9333 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9334 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9335 // Insert scalar instance packing it into a vector.
9336 if (State.VF.isVector() && shouldPack()) {
9337 // If we're constructing lane 0, initialize to start from poison.
9338 if (State.Instance->Lane.isFirstLane()) {
9339 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9340 Value *Poison = PoisonValue::get(
9341 VectorType::get(UI->getType(), State.VF));
9342 State.set(this, Poison, State.Instance->Part);
9343 }
9344 State.packScalarIntoVectorValue(this, *State.Instance);
9345 }
9346 return;
9347 }
9348
9349 if (IsUniform) {
9350 // If the recipe is uniform across all parts (instead of just per VF), only
9351 // generate a single instance.
9352 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9353 all_of(operands(), [](VPValue *Op) {
9354 return Op->isDefinedOutsideVectorRegions();
9355 })) {
9356 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9357 if (user_begin() != user_end()) {
9358 for (unsigned Part = 1; Part < State.UF; ++Part)
9359 State.set(this, State.get(this, VPIteration(0, 0)),
9360 VPIteration(Part, 0));
9361 }
9362 return;
9363 }
9364
9365 // Uniform within VL means we need to generate lane 0 only for each
9366 // unrolled copy.
9367 for (unsigned Part = 0; Part < State.UF; ++Part)
9368 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9369 return;
9370 }
9371
9372 // A store of a loop varying value to a uniform address only needs the last
9373 // copy of the store.
9374 if (isa<StoreInst>(UI) &&
9376 auto Lane = VPLane::getLastLaneForVF(State.VF);
9377 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9378 State);
9379 return;
9380 }
9381
9382 // Generate scalar instances for all VF lanes of all UF parts.
9383 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9384 const unsigned EndLane = State.VF.getKnownMinValue();
9385 for (unsigned Part = 0; Part < State.UF; ++Part)
9386 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9387 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9388}
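// Illustrative source shape (not from this file) for the uniform-address case
// handled above: the address does not vary with the induction variable, so
// only the store from the last unrolled part and lane is observable.
static void storeVaryingValueToUniformAddress(int *P, const int *A, int N) {
  for (int I = 0; I < N; ++I)
    *P = A[I]; // Only the final iteration's store survives.
}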
9389
9391 auto *LI = cast<LoadInst>(&Ingredient);
9392
9393 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9394 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9395 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9396 bool CreateGather = !isConsecutive();
9397
9398 auto &Builder = State.Builder;
9400 for (unsigned Part = 0; Part < State.UF; ++Part) {
9401 Value *NewLI;
9402 Value *Mask = nullptr;
9403 if (auto *VPMask = getMask()) {
9404     // Mask reversal is only needed for real (non-null) masks; a null mask
9405     // stands for all-ones, and the reverse of an all-ones mask is unchanged.
9406 Mask = State.get(VPMask, Part);
9407 if (isReverse())
9408 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9409 }
9410
9411 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
9412 if (CreateGather) {
9413 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
9414 "wide.masked.gather");
9415 } else if (Mask) {
9416 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
9417 PoisonValue::get(DataTy),
9418 "wide.masked.load");
9419 } else {
9420 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
9421 }
9422     // Add metadata to the load, but set the recipe's result to the reverse shuffle.
9423 State.addMetadata(NewLI, LI);
9424 if (Reverse)
9425 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9426 State.set(this, NewLI, Part);
9427 }
9428}
9429
9430/// Use all-true mask for reverse rather than actual mask, as it avoids a
9431 /// dependence without affecting the result.
9433 Value *EVL, const Twine &Name) {
9434 VectorType *ValTy = cast<VectorType>(Operand->getType());
9435 Value *AllTrueMask =
9436 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
9437 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9438 {Operand, AllTrueMask, EVL}, nullptr, Name);
9439}
9440
9442 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9443 "explicit vector length.");
9444 auto *LI = cast<LoadInst>(&Ingredient);
9445
9446 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9447 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9448 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9449 bool CreateGather = !isConsecutive();
9450
9451 auto &Builder = State.Builder;
9453 CallInst *NewLI;
9454 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9455 Value *Addr = State.get(getAddr(), 0, !CreateGather);
9456 Value *Mask = nullptr;
9457 if (VPValue *VPMask = getMask()) {
9458 Mask = State.get(VPMask, 0);
9459 if (isReverse())
9460 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9461 } else {
9462 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9463 }
9464
9465 if (CreateGather) {
9466 NewLI =
9467 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9468 nullptr, "wide.masked.gather");
9469 } else {
9470 VectorBuilder VBuilder(Builder);
9471 VBuilder.setEVL(EVL).setMask(Mask);
9472 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9473 Instruction::Load, DataTy, Addr, "vp.op.load"));
9474 }
9475 NewLI->addParamAttr(
9476 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9477 State.addMetadata(NewLI, LI);
9478 Instruction *Res = NewLI;
9479 if (isReverse())
9480 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
9481 State.set(this, Res, 0);
9482}
9483
9485 auto *SI = cast<StoreInst>(&Ingredient);
9486
9487 VPValue *StoredVPValue = getStoredValue();
9488 bool CreateScatter = !isConsecutive();
9489 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9490
9491 auto &Builder = State.Builder;
9493
9494 for (unsigned Part = 0; Part < State.UF; ++Part) {
9495 Instruction *NewSI = nullptr;
9496 Value *Mask = nullptr;
9497 if (auto *VPMask = getMask()) {
9498     // Mask reversal is only needed for real (non-null) masks; a null mask
9499     // stands for all-ones, and the reverse of an all-ones mask is unchanged.
9500 Mask = State.get(VPMask, Part);
9501 if (isReverse())
9502 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9503 }
9504
9505 Value *StoredVal = State.get(StoredVPValue, Part);
9506 if (isReverse()) {
9507 // If we store to reverse consecutive memory locations, then we need
9508 // to reverse the order of elements in the stored value.
9509 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9510 // We don't want to update the value in the map as it might be used in
9511 // another expression. So don't call resetVectorValue(StoredVal).
9512 }
9513 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
9514 if (CreateScatter)
9515 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
9516 else if (Mask)
9517 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
9518 else
9519 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
9520 State.addMetadata(NewSI, SI);
9521 }
9522}
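// Illustrative source shape (not from this file) for the isReverse() path
// above: a loop walking memory backwards, so consecutive vector lanes map to
// decreasing addresses and both the stored value and any mask are reversed
// before the wide store.
static void reverseConsecutiveStore(int *B, const int *A, int N) {
  for (int I = N - 1; I >= 0; --I)
    B[I] = A[I] + 1;
}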
9523
9525 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9526 "explicit vector length.");
9527 auto *SI = cast<StoreInst>(&Ingredient);
9528
9529 VPValue *StoredValue = getStoredValue();
9530 bool CreateScatter = !isConsecutive();
9531 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9532
9533 auto &Builder = State.Builder;
9535
9536 CallInst *NewSI = nullptr;
9537 Value *StoredVal = State.get(StoredValue, 0);
9538 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9539 if (isReverse())
9540 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
9541 Value *Mask = nullptr;
9542 if (VPValue *VPMask = getMask()) {
9543 Mask = State.get(VPMask, 0);
9544 if (isReverse())
9545 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9546 } else {
9547 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9548 }
9549 Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9550 if (CreateScatter) {
9551 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9552 Intrinsic::vp_scatter,
9553 {StoredVal, Addr, Mask, EVL});
9554 } else {
9555 VectorBuilder VBuilder(Builder);
9556 VBuilder.setEVL(EVL).setMask(Mask);
9557 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9558 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9559 {StoredVal, Addr}));
9560 }
9561 NewSI->addParamAttr(
9562 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9563 State.addMetadata(NewSI, SI);
9564}
9565
9566// Determine how to lower the scalar epilogue, which depends on 1) optimising
9567// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9568// predication, and 4) a TTI hook that analyses whether the loop is suitable
9569// for predication.
9574 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9575 // don't look at hints or options, and don't request a scalar epilogue.
9576 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9577 // LoopAccessInfo (due to code dependency and not being able to reliably get
9578 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9579 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9580 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9581 // back to the old way and vectorize with versioning when forced. See D81345.)
9582 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9586
9587 // 2) If set, obey the directives
9588 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9596 };
9597 }
9598
9599 // 3) If set, obey the hints
9600 switch (Hints.getPredicate()) {
9605 };
9606
9607 // 4) if the TTI hook indicates this is profitable, request predication.
9608 TailFoldingInfo TFI(TLI, &LVL, IAI);
9611
9613}
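// Illustrative way a user reaches step 3) above: the standard clang loop
// pragma (not defined in this file) that sets the predication hint consulted
// via Hints.getPredicate().
static void predicatedEpilogueHintExample(int *A, const int *B, const int *C,
                                          int N) {
#pragma clang loop vectorize_predicate(enable)
  for (int I = 0; I < N; ++I)
    A[I] = B[I] + C[I];
}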
9614
9615// Process the loop in the VPlan-native vectorization path. This path builds
9616// VPlan upfront in the vectorization pipeline, which allows to apply
9617// VPlan-to-VPlan transformations from the very beginning without modifying the
9618// input LLVM IR.
9625 LoopVectorizationRequirements &Requirements) {
9626
9627 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9628 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9629 return false;
9630 }
9631 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9632 Function *F = L->getHeader()->getParent();
9633 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9634
9636 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9637
9638 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9639 &Hints, IAI);
9640 // Use the planner for outer loop vectorization.
9641 // TODO: CM is not used at this point inside the planner. Turn CM into an
9642 // optional argument if we don't need it in the future.
9643 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9644 ORE);
9645
9646 // Get user vectorization factor.
9647 ElementCount UserVF = Hints.getWidth();
9648
9650
9651 // Plan how to best vectorize, return the best VF and its cost.
9652 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9653
9654 // If we are stress testing VPlan builds, do not attempt to generate vector
9655 // code. Masked vector code generation support will follow soon.
9656 // Also, do not attempt to vectorize if no vector code will be produced.
9658 return false;
9659
9660 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9661
9662 {
9663 bool AddBranchWeights =
9664 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9665 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9666 F->getParent()->getDataLayout(), AddBranchWeights);
9667 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9668 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9669 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9670 << L->getHeader()->getParent()->getName() << "\"\n");
9671 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9672 }
9673
9674 reportVectorization(ORE, L, VF, 1);
9675
9676 // Mark the loop as already vectorized to avoid vectorizing again.
9677 Hints.setAlreadyVectorized();
9678 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9679 return true;
9680}
9681
9682// Emit a remark if there are stores to floats that required a floating point
9683 // extension. If the vectorized loop was generated with floating point, there
9684// will be a performance penalty from the conversion overhead and the change in
9685// the vector width.
9688 for (BasicBlock *BB : L->getBlocks()) {
9689 for (Instruction &Inst : *BB) {
9690 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9691 if (S->getValueOperand()->getType()->isFloatTy())
9692 Worklist.push_back(S);
9693 }
9694 }
9695 }
9696
9697  // Traverse the floating point stores upwards, searching for floating point
9698 // conversions.
9701 while (!Worklist.empty()) {
9702 auto *I = Worklist.pop_back_val();
9703 if (!L->contains(I))
9704 continue;
9705 if (!Visited.insert(I).second)
9706 continue;
9707
9708 // Emit a remark if the floating point store required a floating
9709 // point conversion.
9710 // TODO: More work could be done to identify the root cause such as a
9711 // constant or a function return type and point the user to it.
9712 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9713 ORE->emit([&]() {
9714 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9715 I->getDebugLoc(), L->getHeader())
9716 << "floating point conversion changes vector width. "
9717 << "Mixed floating point precision requires an up/down "
9718 << "cast that will negatively impact performance.";
9719 });
9720
9721 for (Use &Op : I->operands())
9722 if (auto *OpI = dyn_cast<Instruction>(Op))
9723 Worklist.push_back(OpI);
9724 }
9725}
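// Illustrative input (an assumption, not from this file) that triggers the
// remark emitted above: float data promoted to double mid-expression and
// stored back as float, leaving an fpext/fptrunc pair in the loop body.
static void mixedPrecisionStoreExample(float *Out, const float *In, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = (float)(In[I] * 1.5); // 1.5 is a double constant.
}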
9726
9727static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9729 std::optional<unsigned> VScale, Loop *L,
9730 ScalarEvolution &SE,
9732 InstructionCost CheckCost = Checks.getCost();
9733 if (!CheckCost.isValid())
9734 return false;
9735
9736  // When interleaving only, the scalar and vector costs will be equal, which
9737  // in turn would lead to a divide by 0. Fall back to the hard threshold.
9738 if (VF.Width.isScalar()) {
9739 if (CheckCost > VectorizeMemoryCheckThreshold) {
9740 LLVM_DEBUG(
9741 dbgs()
9742 << "LV: Interleaving only is not profitable due to runtime checks\n");
9743 return false;
9744 }
9745 return true;
9746 }
9747
9748 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9749 uint64_t ScalarC = *VF.ScalarCost.getValue();
9750 if (ScalarC == 0)
9751 return true;
9752
9753 // First, compute the minimum iteration count required so that the vector
9754 // loop outperforms the scalar loop.
9755 // The total cost of the scalar loop is
9756 // ScalarC * TC
9757 // where
9758 // * TC is the actual trip count of the loop.
9759 // * ScalarC is the cost of a single scalar iteration.
9760 //
9761 // The total cost of the vector loop is
9762 // RtC + VecC * (TC / VF) + EpiC
9763 // where
9764 // * RtC is the cost of the generated runtime checks
9765 // * VecC is the cost of a single vector iteration.
9766 // * TC is the actual trip count of the loop
9767 // * VF is the vectorization factor
9768  //   * EpiC is the cost of the generated epilogue, including the cost
9769 // of the remaining scalar operations.
9770 //
9771 // Vectorization is profitable once the total vector cost is less than the
9772 // total scalar cost:
9773 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9774 //
9775 // Now we can compute the minimum required trip count TC as
9776 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9777 //
9778 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9779  // the computations are performed with integer arithmetic and the division
9780  // is rounded up, hence we get an upper estimate of the TC.
9781 unsigned IntVF = VF.Width.getKnownMinValue();
9782 if (VF.Width.isScalable()) {
9783 unsigned AssumedMinimumVscale = 1;
9784 if (VScale)
9785 AssumedMinimumVscale = *VScale;
9786 IntVF *= AssumedMinimumVscale;
9787 }
9788 uint64_t RtC = *CheckCost.getValue();
9789 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9790 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9791
9792 // Second, compute a minimum iteration count so that the cost of the
9793 // runtime checks is only a fraction of the total scalar loop cost. This
9794 // adds a loop-dependent bound on the overhead incurred if the runtime
9795 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9796 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9797 // cost, compute
9798 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9799 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
9800
9801 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9802 // epilogue is allowed, choose the next closest multiple of VF. This should
9803 // partly compensate for ignoring the epilogue cost.
9804 uint64_t MinTC = std::max(MinTC1, MinTC2);
9805 if (SEL == CM_ScalarEpilogueAllowed)
9806 MinTC = alignTo(MinTC, IntVF);
9808
9809 LLVM_DEBUG(
9810 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9811 << VF.MinProfitableTripCount << "\n");
9812
9813 // Skip vectorization if the expected trip count is less than the minimum
9814 // required trip count.
9815 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9818 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9819 "trip count < minimum profitable VF ("
9820 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9821 << ")\n");
9822
9823 return false;
9824 }
9825 }
9826 return true;
9827}
9828
9830 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9832 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9834
9836 assert((EnableVPlanNativePath || L->isInnermost()) &&
9837 "VPlan-native path is not enabled. Only process inner loops.");
9838
9839#ifndef NDEBUG
9840 const std::string DebugLocStr = getDebugLocString(L);
9841#endif /* NDEBUG */
9842
9843 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9844 << L->getHeader()->getParent()->getName() << "' from "
9845 << DebugLocStr << "\n");
9846
9847 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9848
9849 LLVM_DEBUG(
9850 dbgs() << "LV: Loop hints:"
9851 << " force="
9853 ? "disabled"
9855 ? "enabled"
9856 : "?"))
9857 << " width=" << Hints.getWidth()
9858 << " interleave=" << Hints.getInterleave() << "\n");
9859
9860 // Function containing loop
9861 Function *F = L->getHeader()->getParent();
9862
9863 // Looking at the diagnostic output is the only way to determine if a loop
9864 // was vectorized (other than looking at the IR or machine code), so it
9865 // is important to generate an optimization remark for each loop. Most of
9866 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9867 // generated as OptimizationRemark and OptimizationRemarkMissed are
9868  // less verbose and report, respectively, vectorized loops and unvectorized
9869  // loops that may benefit from vectorization.
9870
9871 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9872 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9873 return false;
9874 }
9875
9876 PredicatedScalarEvolution PSE(*SE, *L);
9877
9878 // Check if it is legal to vectorize the loop.
9879 LoopVectorizationRequirements Requirements;
9880 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9881 &Requirements, &Hints, DB, AC, BFI, PSI);
9883 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9884 Hints.emitRemarkWithHints();
9885 return false;
9886 }
9887
9888 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9889 // here. They may require CFG and instruction level transformations before
9890 // even evaluating whether vectorization is profitable. Since we cannot modify
9891 // the incoming IR, we need to build VPlan upfront in the vectorization
9892 // pipeline.
9893 if (!L->isInnermost())
9894 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9895 ORE, BFI, PSI, Hints, Requirements);
9896
9897 assert(L->isInnermost() && "Inner loop expected.");
9898
9899 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9900 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9901
9902 // If an override option has been passed in for interleaved accesses, use it.
9903 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9904 UseInterleaved = EnableInterleavedMemAccesses;
9905
9906 // Analyze interleaved memory accesses.
9907 if (UseInterleaved)
9909
9910 // Check the function attributes and profiles to find out if this function
9911 // should be optimized for size.
9913 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9914
9915 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9916 // count by optimizing for size, to minimize overheads.
9917 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9918 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9919 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9920 << "This loop is worth vectorizing only if no scalar "
9921 << "iteration overheads are incurred.");
9923 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9924 else {
9925 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9926 LLVM_DEBUG(dbgs() << "\n");
9927        // Predicated, tail-folded loops are efficient even when the loop
9928 // iteration count is low. However, setting the epilogue policy to
9929 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9930 // with runtime checks. It's more effective to let
9931 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9932 // for the loop.
9935 } else {
9936 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9937 "small to consider vectorizing.\n");
9939          "The trip count is below the minimal threshold value.",
9940 "loop trip count is too low, avoiding vectorization",
9941 "LowTripCount", ORE, L);
9942 Hints.emitRemarkWithHints();
9943 return false;
9944 }
9945 }
9946 }
9947
9948 // Check the function attributes to see if implicit floats or vectors are
9949 // allowed.
9950 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9952 "Can't vectorize when the NoImplicitFloat attribute is used",
9953 "loop not vectorized due to NoImplicitFloat attribute",
9954 "NoImplicitFloat", ORE, L);
9955 Hints.emitRemarkWithHints();
9956 return false;
9957 }
9958
9959 // Check if the target supports potentially unsafe FP vectorization.
9960 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9961 // for the target we're vectorizing for, to make sure none of the
9962 // additional fp-math flags can help.
9963 if (Hints.isPotentiallyUnsafe() &&
9966 "Potentially unsafe FP op prevents vectorization",
9967 "loop not vectorized due to unsafe FP support.",
9968 "UnsafeFP", ORE, L);
9969 Hints.emitRemarkWithHints();
9970 return false;
9971 }
9972
9973 bool AllowOrderedReductions;
9974 // If the flag is set, use that instead and override the TTI behaviour.
9975 if (ForceOrderedReductions.getNumOccurrences() > 0)
9976 AllowOrderedReductions = ForceOrderedReductions;
9977 else
9978 AllowOrderedReductions = TTI->enableOrderedReductions();
9979 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9980 ORE->emit([&]() {
9981 auto *ExactFPMathInst = Requirements.getExactFPInst();
9982 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9983 ExactFPMathInst->getDebugLoc(),
9984 ExactFPMathInst->getParent())
9985 << "loop not vectorized: cannot prove it is safe to reorder "
9986 "floating-point operations";
9987 });
9988 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9989 "reorder floating-point operations\n");
9990 Hints.emitRemarkWithHints();
9991 return false;
9992 }
9993
9994 // Use the cost model.
9995 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9996 F, &Hints, IAI);
9997 // Use the planner for vectorization.
9998 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9999 ORE);
10000
10001 // Get user vectorization factor and interleave count.
10002 ElementCount UserVF = Hints.getWidth();
10003 unsigned UserIC = Hints.getInterleave();
10004
10005 // Plan how to best vectorize, return the best VF and its cost.
10006 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10007
10009 unsigned IC = 1;
10010
10011 bool AddBranchWeights =
10012 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10013 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10014 F->getParent()->getDataLayout(), AddBranchWeights);
10015 if (MaybeVF) {
10016 VF = *MaybeVF;
10017 // Select the interleave count.
10018 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10019
10020 unsigned SelectedIC = std::max(IC, UserIC);
10021 // Optimistically generate runtime checks if they are needed. Drop them if
10022 // they turn out to not be profitable.
10023 if (VF.Width.isVector() || SelectedIC > 1)
10024 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10025
10026 // Check if it is profitable to vectorize with runtime checks.
10027 bool ForceVectorization =
10029 if (!ForceVectorization &&
10031 *PSE.getSE(), SEL)) {
10032 ORE->emit([&]() {
10034 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10035 L->getHeader())
10036 << "loop not vectorized: cannot prove it is safe to reorder "
10037 "memory operations";
10038 });
10039 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10040 Hints.emitRemarkWithHints();
10041 return false;
10042 }
10043 }
10044
10045 // Identify the diagnostic messages that should be produced.
10046 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10047 bool VectorizeLoop = true, InterleaveLoop = true;
10048 if (VF.Width.isScalar()) {
10049 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10050 VecDiagMsg = std::make_pair(
10051 "VectorizationNotBeneficial",
10052 "the cost-model indicates that vectorization is not beneficial");
10053 VectorizeLoop = false;
10054 }
10055
10056 if (!MaybeVF && UserIC > 1) {
10057 // Tell the user interleaving was avoided up-front, despite being explicitly
10058 // requested.
10059 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10060 "interleaving should be avoided up front\n");
10061 IntDiagMsg = std::make_pair(
10062 "InterleavingAvoided",
10063 "Ignoring UserIC, because interleaving was avoided up front");
10064 InterleaveLoop = false;
10065 } else if (IC == 1 && UserIC <= 1) {
10066 // Tell the user interleaving is not beneficial.
10067 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10068 IntDiagMsg = std::make_pair(
10069 "InterleavingNotBeneficial",
10070 "the cost-model indicates that interleaving is not beneficial");
10071 InterleaveLoop = false;
10072 if (UserIC == 1) {
10073 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10074 IntDiagMsg.second +=
10075 " and is explicitly disabled or interleave count is set to 1";
10076 }
10077 } else if (IC > 1 && UserIC == 1) {
10078    // Tell the user interleaving is beneficial, but it is explicitly disabled.
10079 LLVM_DEBUG(
10080 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10081 IntDiagMsg = std::make_pair(
10082 "InterleavingBeneficialButDisabled",
10083 "the cost-model indicates that interleaving is beneficial "
10084 "but is explicitly disabled or interleave count is set to 1");
10085 InterleaveLoop = false;
10086 }
10087
10088 // Override IC if user provided an interleave count.
10089 IC = UserIC > 0 ? UserIC : IC;
10090
10091 // Emit diagnostic messages, if any.
10092 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10093 if (!VectorizeLoop && !InterleaveLoop) {
10094    // Do not vectorize or interleave the loop.
10095 ORE->emit([&]() {
10096 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10097 L->getStartLoc(), L->getHeader())
10098 << VecDiagMsg.second;
10099 });
10100 ORE->emit([&]() {
10101 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10102 L->getStartLoc(), L->getHeader())
10103 << IntDiagMsg.second;
10104 });
10105 return false;
10106 } else if (!VectorizeLoop && InterleaveLoop) {
10107 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10108 ORE->emit([&]() {
10109 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10110 L->getStartLoc(), L->getHeader())
10111 << VecDiagMsg.second;
10112 });
10113 } else if (VectorizeLoop && !InterleaveLoop) {
10114 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10115 << ") in " << DebugLocStr << '\n');
10116 ORE->emit([&]() {
10117 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10118 L->getStartLoc(), L->getHeader())
10119 << IntDiagMsg.second;
10120 });
10121 } else if (VectorizeLoop && InterleaveLoop) {
10122 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10123 << ") in " << DebugLocStr << '\n');
10124 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10125 }
10126
10127 bool DisableRuntimeUnroll = false;
10128 MDNode *OrigLoopID = L->getLoopID();
10129 {
10130 using namespace ore;
10131 if (!VectorizeLoop) {
10132 assert(IC > 1 && "interleave count should not be 1 or 0");
10133 // If we decided that it is not legal to vectorize the loop, then
10134 // interleave it.
10135 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10136 &CM, BFI, PSI, Checks);
10137
10138 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10139 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10140
10141 ORE->emit([&]() {
10142 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10143 L->getHeader())
10144 << "interleaved loop (interleaved count: "
10145 << NV("InterleaveCount", IC) << ")";
10146 });
10147 } else {
10148 // If we decided that it is *legal* to vectorize the loop, then do it.
10149
10150 // Consider vectorizing the epilogue too if it's profitable.
10151 VectorizationFactor EpilogueVF =
10153 if (EpilogueVF.Width.isVector()) {
10154
10155 // The first pass vectorizes the main loop and creates a scalar epilogue
10156 // to be vectorized by executing the plan (potentially with a different
10157 // factor) again shortly afterwards.
10158 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10159 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10160 EPI, &LVL, &CM, BFI, PSI, Checks);
10161
10162 std::unique_ptr<VPlan> BestMainPlan(
10164 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10165 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10166 ++LoopsVectorized;
10167
10168 // Second pass vectorizes the epilogue and adjusts the control flow
10169 // edges from the first pass.
10170 EPI.MainLoopVF = EPI.EpilogueVF;
10171 EPI.MainLoopUF = EPI.EpilogueUF;
10172 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10173 ORE, EPI, &LVL, &CM, BFI, PSI,
10174 Checks);
10175
10176 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10177 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10178 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10179 Header->setName("vec.epilog.vector.body");
10180
10181 // Re-use the trip count and steps expanded for the main loop, as
10182 // skeleton creation needs it as a value that dominates both the scalar
10183 // and vector epilogue loops
10184 // TODO: This is a workaround needed for epilogue vectorization and it
10185 // should be removed once induction resume value creation is done
10186 // directly in VPlan.
10187 EpilogILV.setTripCount(MainILV.getTripCount());
10188 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10189 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10190 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10191 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10192 ExpandR->replaceAllUsesWith(ExpandedVal);
10193 if (BestEpiPlan.getTripCount() == ExpandR)
10194 BestEpiPlan.resetTripCount(ExpandedVal);
10195 ExpandR->eraseFromParent();
10196 }
10197
10198 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10199 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10200 // before vectorizing the epilogue loop.
10201 for (VPRecipeBase &R : Header->phis()) {
10202 if (isa<VPCanonicalIVPHIRecipe>(&R))
10203 continue;
10204
10205 Value *ResumeV = nullptr;
10206 // TODO: Move setting of resume values to prepareToExecute.
10207 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10208 const RecurrenceDescriptor &RdxDesc =
10209 ReductionPhi->getRecurrenceDescriptor();
10210 RecurKind RK = RdxDesc.getRecurrenceKind();
10211 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10213 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10214 // start value; compare the final value from the main vector loop
10215 // to the start value.
10216 IRBuilder<> Builder(
10217 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10218 ResumeV = Builder.CreateICmpNE(ResumeV,
10219 RdxDesc.getRecurrenceStartValue());
10220 }
10221 } else {
10222 // Create induction resume values for both widened pointer and
10223 // integer/fp inductions and update the start value of the induction
10224 // recipes to use the resume value.
10225 PHINode *IndPhi = nullptr;
10226 const InductionDescriptor *ID;
10227 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10228 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10229 ID = &Ind->getInductionDescriptor();
10230 } else {
10231 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10232 IndPhi = WidenInd->getPHINode();
10233 ID = &WidenInd->getInductionDescriptor();
10234 }
10235
10236 ResumeV = MainILV.createInductionResumeValue(
10237 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10239 }
10240 assert(ResumeV && "Must have a resume value");
10241 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10242 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10243 }
10244
10245 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10246 DT, true, &ExpandedSCEVs);
10247 ++LoopsEpilogueVectorized;
10248
10249 if (!MainILV.areSafetyChecksAdded())
10250 DisableRuntimeUnroll = true;
10251 } else {
10252 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10253 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10254 PSI, Checks);
10255
10256 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10257 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10258 ++LoopsVectorized;
10259
10260 // Add metadata to disable runtime unrolling a scalar loop when there
10261 // are no runtime checks about strides and memory. A scalar loop that is
10262 // rarely used is not worth unrolling.
10263 if (!LB.areSafetyChecksAdded())
10264 DisableRuntimeUnroll = true;
10265 }
10266 // Report the vectorization decision.
10267 reportVectorization(ORE, L, VF, IC);
10268 }
10269
10272 }
10273
10274 std::optional<MDNode *> RemainderLoopID =
10277 if (RemainderLoopID) {
10278 L->setLoopID(*RemainderLoopID);
10279 } else {
10280 if (DisableRuntimeUnroll)
10282
10283 // Mark the loop as already vectorized to avoid vectorizing again.
10284 Hints.setAlreadyVectorized();
10285 }
10286
10287 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10288 return true;
10289}
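// Illustrative sketch of the user-facing knobs consulted above (standard clang
// loop pragmas, not defined in this file): the hints become Hints.getWidth()
// and Hints.getInterleave(), i.e. UserVF and UserIC in processLoop.
static void userHintedLoopExample(int *A, const int *B, int N) {
#pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
  for (int I = 0; I < N; ++I)
    A[I] += B[I];
}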
10290
10296 SE = &SE_;
10297 LI = &LI_;
10298 TTI = &TTI_;
10299 DT = &DT_;
10300 BFI = BFI_;
10301 TLI = TLI_;
10302 AC = &AC_;
10303 LAIs = &LAIs_;
10304 DB = &DB_;
10305 ORE = &ORE_;
10306 PSI = PSI_;
10307
10308 // Don't attempt if
10309 // 1. the target claims to have no vector registers, and
10310 // 2. interleaving won't help ILP.
10311 //
10312 // The second condition is necessary because, even if the target has no
10313 // vector registers, loop vectorization may still enable scalar
10314 // interleaving.
10317 return LoopVectorizeResult(false, false);
10318
10319 bool Changed = false, CFGChanged = false;
10320
10321 // The vectorizer requires loops to be in simplified form.
10322 // Since simplification may add new inner loops, it has to run before the
10323 // legality and profitability checks. This means running the loop vectorizer
10324  // will simplify all loops, regardless of whether anything ends up being
10325 // vectorized.
10326 for (const auto &L : *LI)
10327 Changed |= CFGChanged |=
10328 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10329
10330 // Build up a worklist of inner-loops to vectorize. This is necessary as
10331 // the act of vectorizing or partially unrolling a loop creates new loops
10332 // and can invalidate iterators across the loops.
10333 SmallVector<Loop *, 8> Worklist;
10334
10335 for (Loop *L : *LI)
10336 collectSupportedLoops(*L, LI, ORE, Worklist);
10337
10338 LoopsAnalyzed += Worklist.size();
10339
10340 // Now walk the identified inner loops.
10341 while (!Worklist.empty()) {
10342 Loop *L = Worklist.pop_back_val();
10343
10344 // For the inner loops we actually process, form LCSSA to simplify the
10345 // transform.
10346 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10347
10348 Changed |= CFGChanged |= processLoop(L);
10349
10350 if (Changed) {
10351 LAIs->clear();
10352
10353#ifndef NDEBUG
10354 if (VerifySCEV)
10355 SE->verify();
10356#endif
10357 }
10358 }
10359
10360 // Process each loop nest in the function.
10361 return LoopVectorizeResult(Changed, CFGChanged);
10362}
10363
10366 auto &LI = AM.getResult<LoopAnalysis>(F);
10367 // There are no loops in the function. Return before computing other expensive
10368 // analyses.
10369 if (LI.empty())
10370 return PreservedAnalyses::all();
10372 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10373 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10374 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10375 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10376 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10378
10380 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10382 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10383 BlockFrequencyInfo *BFI = nullptr;
10384 if (PSI && PSI->hasProfileSummary())
10386 LoopVectorizeResult Result =
10387 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10388 if (!Result.MadeAnyChange)
10389 return PreservedAnalyses::all();
10391
10392 if (isAssignmentTrackingEnabled(*F.getParent())) {
10393 for (auto &BB : F)
10395 }
10396
10397 // We currently do not preserve loopinfo/dominator analyses with outer loop
10398 // vectorization. Until this is addressed, mark these analyses as preserved
10399 // only for non-VPlan-native path.
10400 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10401 if (!EnableVPlanNativePath) {
10402 PA.preserve<LoopAnalysis>();
10405 }
10406
10407 if (Result.MadeCFGChange) {
10408 // Making CFG changes likely means a loop got vectorized. Indicate that
10409 // extra simplification passes should be run.
10410    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10411 // be run if runtime checks have been added.
10414 } else {
10416 }
10417 return PA;
10418}
10419
10421 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10422 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10423 OS, MapClassName2PassName);
10424
10425 OS << '<';
10426 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10427 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10428 OS << '>';
10429}
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform CSE (common subexpression elimination) of induction variable instructions.
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
This file contains the declarations for profiling metadata utility functions.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
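A minimal sketch of the two APInt members listed above; the 32-bit width and the helper name are illustrative only.

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

// All bits set in a 32-bit APInt reads back as -1 when sign-extended.
static void apintSketch() {
  APInt AllOnes = APInt::getAllOnes(32);
  assert(AllOnes.getSExtValue() == -1 && "all-ones is -1 as a signed value");
}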
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:411
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:499
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:360
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:452
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
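A short sketch tying together the BasicBlock accessors listed above; inspectBlock is a hypothetical helper and BB is assumed to be a well-formed block inside a function.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void inspectBlock(BasicBlock &BB) {
  for (PHINode &Phi : BB.phis())                        // header PHIs, if any
    (void)Phi;
  const Instruction *FirstNonPhi = BB.getFirstNonPHI();
  const Instruction *Term = BB.getTerminator();         // null if malformed
  const BasicBlock *Pred = BB.getSinglePredecessor();   // null if 0 or >1 preds
  (void)FirstNonPhi; (void)Term; (void)Pred;
}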
BinaryOps getOpcode() const
Definition: InstrTypes.h:513
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2227
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
This class represents a function call, abstracting a target machine's calling convention.
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:856
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
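A sketch of the DenseMap operations above, using an instruction-to-cost map in the spirit of the cost-model tables in this file; the map contents and helper name are made up.

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void denseMapSketch(Instruction *I) {
  DenseMap<Instruction *, unsigned> CostMap;
  CostMap.insert({I, 4});            // returns {iterator, bool inserted}
  unsigned C = CostMap.lookup(I);    // value, or 0 if the key were absent
  if (CostMap.contains(I) && CostMap.count(I) == 1)
    C = CostMap.at(I);               // at() asserts that the key exists
  auto It = CostMap.find(I);
  if (It != CostMap.end())
    It->second += C;
  (void)CostMap.empty();
}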
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
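A sketch of the dominator-tree bookkeeping the vectorizer performs when it introduces new blocks; registerNewBlock is a hypothetical helper, and NewBB/DomBB are assumed to be consistent with the current CFG.

#include "llvm/IR/Dominators.h"
#include <cassert>
using namespace llvm;

static void registerNewBlock(DominatorTree &DT, BasicBlock *NewBB,
                             BasicBlock *DomBB) {
  DT.addNewBlock(NewBB, DomBB);  // NewBB becomes a tree child of DomBB
  assert(DT.properlyDominates(DT.getNode(DomBB), DT.getNode(NewBB)));
  assert(DT.verify() && "dominator tree out of sync with the CFG");
}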
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:323
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:314
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
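The fixed/scalable distinction above is central to this file; a brief sketch follows (the concrete widths are arbitrary).

#include "llvm/Support/TypeSize.h"
using namespace llvm;

static void elementCountSketch() {
  ElementCount Fixed4 = ElementCount::getFixed(4);    // e.g. <4 x i32>
  ElementCount Scal4 = ElementCount::getScalable(4);  // e.g. <vscale x 4 x i32>
  ElementCount One = ElementCount::get(1, /*Scalable=*/false);
  bool IsVec = Fixed4.isVector();   // true: more than one element
  bool IsScalar = One.isScalar();   // true: exactly one fixed element
  (void)Scal4; (void)IsVec; (void)IsScalar;
}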
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:319
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:703
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:677
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements (a splat).
Definition: IRBuilder.cpp:1212
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:526
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Definition: IRBuilder.cpp:1170
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2245
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1721
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2205
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2241
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:145
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1327
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2351
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1404
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Definition: IRBuilder.cpp:109
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
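A compact sketch combining a few of the IRBuilder calls listed above (SetInsertPoint, CreateAdd, CreateICmp, getInt32, CreateSelect); buildClampedSum is a hypothetical helper, and X/Y are assumed to be i32 values available in BB's function.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *buildClampedSum(BasicBlock *BB, Value *X, Value *Y) {
  IRBuilder<> B(BB->getContext());
  B.SetInsertPoint(BB);                         // append to the end of BB
  Value *Sum = B.CreateAdd(X, Y, "sum");
  // Unsigned overflow check: if X + Y wrapped, the sum is smaller than X.
  Value *Wrapped = B.CreateICmp(CmpInst::ICMP_ULT, Sum, X, "wrapped");
  return B.CreateSelect(Wrapped, B.getInt32(0), Sum, "clamped");
}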
A struct for saving information about induction variables.
BinaryOperator * getInductionBinOp() const
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
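A sketch of querying an InductionDescriptor the way the vectorizer does when widening an induction; inspectInduction is a hypothetical helper and ID is assumed to describe a legal induction.

#include "llvm/Analysis/IVDescriptors.h"
using namespace llvm;

static void inspectInduction(const InductionDescriptor &ID) {
  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_PtrInduction: {
    const SCEV *Step = ID.getStep();      // the induction step as a SCEV
    Value *Start = ID.getStartValue();    // value on entry to the loop
    (void)Step; (void)Start;
    break;
  }
  case InductionDescriptor::IK_FpInduction:
    (void)ID.getInductionBinOp();         // the fadd/fsub that updates the IV
    break;
  case InductionDescriptor::IK_NoInduction:
    break;
  }
}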
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitability analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void vectorizeInterleaveGroup(const InterleaveGroup< Instruction > *Group, ArrayRef< VPValue * > VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef< VPValue * > StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps)
Try to vectorize interleaved access group Group with the base address given in Addr,...
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State)
Create the exit value of first order recurrences in the middle block and update their users.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:451
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:444
uint32_t getFactor() const
Definition: VectorUtils.h:460
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:514
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
Definition: VectorUtils.h:521
bool isReverse() const
Definition: VectorUtils.h:459
InstTy * getInsertPos() const
Definition: VectorUtils.h:530
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
Definition: VectorUtils.h:461
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:586
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:631
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:642
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:623
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:606
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:636
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
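A sketch of how the interleave-group queries above fit together; walkInterleaveGroups is a hypothetical helper and IAI is assumed to have been constructed for the loop under consideration.

#include "llvm/Analysis/VectorUtils.h"
#include <cstdint>
using namespace llvm;

static void walkInterleaveGroups(InterleavedAccessInfo &IAI,
                                 bool AllowMaskedGroups) {
  IAI.analyzeInterleaving(AllowMaskedGroups);
  for (InterleaveGroup<Instruction> *Group : IAI.getInterleaveGroups()) {
    for (uint32_t I = 0, F = Group->getFactor(); I < F; ++I)
      if (Instruction *Member = Group->getMember(I))
        (void)Member;                     // gaps show up as null members
    (void)Group->getInsertPos();          // where the wide access is emitted
  }
  if (IAI.requiresScalarEpilogue())       // some group may access out of bounds
    IAI.invalidateGroupsRequiringScalarEpilogue();
}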
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic stride, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor (exit) blocks of this loop, i.e. the blocks outside the loop that are branched to.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1222
RPOIterator endRPO() const
Definition: LoopIterator.h:140
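A sketch of visiting a loop's blocks in reverse post-order via the cached DFS, the traversal order used when widening instructions; visitLoopRPO is a hypothetical helper, and L/LI are assumed to come from the LoopInfo analysis of the enclosing function.

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
using namespace llvm;

static void visitLoopRPO(Loop *L, LoopInfo *LI) {
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);                        // compute and cache the post-order
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
    (void)BB;                             // the loop header is visited first
}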
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
bool isAccessInterleaved(Instruction *Instr)
Check if Instr belongs to any interleaved access group.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::pair< InstructionCost, bool > VectorizationCostTy
The vectorization cost is a combination of the cost itself and a boolean indicating whether any of th...
DemandedBits * DB
Demanded bits analysis.
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
VectorizationCostTy expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for two cases: whether or not the IV update may overflow.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr)
Get the interleaved access group that Instr belongs to.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
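A sketch (illustrative only, helper name hypothetical) of one plausible ordering of the cost-model queries above for a single candidate VF; LoopVectorizationCostModel is the class documented here, and CM is assumed to be fully constructed for the loop.

static void costModelSketch(LoopVectorizationCostModel &CM, ElementCount VF) {
  CM.collectUniformsAndScalars(VF);       // classify uniform/scalar values
  CM.setCostBasedWideningDecision(VF);    // per-access widening decisions
  auto [Cost, OpsAreVector] = CM.expectedCost(VF); // cost plus vector-op flag
  unsigned IC = CM.selectInterleaveCount(VF, Cost);
  (void)OpsAreVector; (void)IC;
}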
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool prepareToFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking, and mark all respective ...
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
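A sketch of typical legality queries made before and during planning; legalitySketch is a hypothetical helper, and LVL is assumed to have been built for the candidate loop.

static void legalitySketch(LoopVectorizationLegality &LVL, PHINode *Phi,
                           BasicBlock *BB) {
  if (!LVL.canVectorize(/*UseVPlanNativePath=*/false))
    return;
  bool IsIV = LVL.isInductionPhi(Phi);        // induction header phi?
  bool IsRdx = LVL.isReductionVariable(Phi);  // reduction header phi?
  bool NeedsPred = LVL.blockNeedsPredication(BB);
  (void)IsIV; (void)IsRdx; (void)NeedsPred;
}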
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
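A sketch of the usual call pattern for getDecisionAndClampRange: evaluate a per-VF predicate and clamp Range to the sub-range over which the answer stays the same, returning that answer. willBeScalarized is a hypothetical helper; the query in the lambda is the cost model's isScalarAfterVectorization listed earlier.

static bool willBeScalarized(LoopVectorizationCostModel &CM, Instruction *I,
                             VFRange &Range) {
  return LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isScalarAfterVectorization(I, VF); },
      Range);
}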
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints that enable vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:66
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:501
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
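A small sketch of building a metadata tuple with MDString::get and MDTuple::get, in the spirit of the loop metadata this file attaches; the exact node layout used by the pass may differ, and the helper name is hypothetical.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include <cassert>
using namespace llvm;

static MDNode *makeUnrollDisableMD(LLVMContext &Ctx) {
  Metadata *Ops[] = {MDString::get(Ctx, "llvm.loop.unroll.runtime.disable")};
  MDNode *Node = MDTuple::get(Ctx, Ops);   // uniqued tuple !{!"llvm.loop..."}
  assert(Node->getNumOperands() == 1);
  return Node;
}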
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
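MapVector preserves insertion order, which matters for deterministic iteration over tables like the minimal-bit-width map above; a brief sketch with made-up contents (A and B assumed distinct).

#include "llvm/ADT/MapVector.h"
#include "llvm/IR/Instruction.h"
#include <cassert>
#include <cstdint>
using namespace llvm;

static void mapVectorSketch(Instruction *A, Instruction *B) {
  MapVector<Instruction *, uint64_t> MinBWs;
  MinBWs[A] = 8;
  MinBWs[B] = 16;
  assert(MinBWs.size() == 2 && !MinBWs.empty());
  auto It = MinBWs.find(A);
  if (It != MinBWs.end())
    It->second = 32;
  for (auto &KV : MinBWs)   // visits A then B: insertion order is kept
    (void)KV;
}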
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR uni...
Definition: PassManager.h:756
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
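A sketch of creating a resume-style PHI with one incoming value per predecessor, similar in spirit to the induction resume values built in this file; the helper and block names are hypothetical, and the Create overload with an insertion iterator is the one listed above.

#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *createResumePhi(BasicBlock *Header, BasicBlock *Preheader,
                                BasicBlock *Bypass, Value *Init,
                                Value *BypassVal) {
  PHINode *Phi = PHINode::Create(Init->getType(), /*NumReservedValues=*/2,
                                 "resume.val", Header->begin());
  Phi->addIncoming(Init, Preheader);
  Phi->addIncoming(BypassVal, Bypass);
  // A particular predecessor's value can be overwritten later if needed:
  Phi->setIncomingValueForBlock(Bypass, BypassVal);
  return Phi;
}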
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:129
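A sketch of the standard return pattern for a transform pass using the members above; the particular analyses preserved by the loop vectorizer are decided in its run() method, so the choices here are illustrative only.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

static PreservedAnalyses reportPreservation(bool Changed, bool CFGUntouched) {
  if (!Changed)
    return PreservedAnalyses::all();   // nothing was invalidated
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();         // LoopInfo was kept up to date
  PA.preserve<DominatorTreeAnalysis>();
  if (CFGUntouched)
    PA.preserveSet<CFGAnalyses>();     // everything that only depends on the CFG
  return PA;
}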
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
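A sketch of inspecting a RecurrenceDescriptor once legality analysis has populated it (the helper name is hypothetical; RD is assumed to come from RecurrenceDescriptor::isReductionPHI or similar):
#include "llvm/Analysis/IVDescriptors.h"
using namespace llvm;
void describeReduction(const RecurrenceDescriptor &RD) {
  RecurKind Kind = RD.getRecurrenceKind();      // e.g. integer add, FP min
  Type *Ty = RD.getRecurrenceType();            // type the reduction runs in
  bool NeedsOrderedFP = RD.isOrdered();         // strict FP evaluation order
  (void)Kind; (void)Ty; (void)NeedsOrderedFP;
}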
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
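A sketch of the ScalarEvolution trip-count queries listed above (the helper name is hypothetical):
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;
void inspectTripCount(ScalarEvolution &SE, const Loop *L) {
  // Exact trip count if it folds to a small constant, 0 otherwise.
  unsigned Exact = SE.getSmallConstantTripCount(L);
  // Conservative upper bound on the trip count.
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
  (void)Exact; (void)MaxTC;
}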
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
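A sketch of the deduplicating worklist idiom that SmallSetVector enables (the helper name is hypothetical):
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// insert() returns false for elements already present, so each transitive
// user is visited exactly once even with cyclic use chains.
void visitTransitiveUsers(Instruction *Root) {
  SmallSetVector<Instruction *, 8> Worklist;
  Worklist.insert(Root);
  for (unsigned I = 0; I < Worklist.size(); ++I)
    for (User *U : Worklist[I]->users())
      if (auto *UI = dyn_cast<Instruction>(U))
        Worklist.insert(UI);
}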
This class provides computation of slot numbers for LLVM Assembly writing.
Definition: AsmWriter.cpp:693
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
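A sketch combining SmallPtrSet and SmallVector, the usual pattern for collecting each value at most once while preserving first-seen order (the helper name is hypothetical):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"
using namespace llvm;
// SmallPtrSet::insert reports whether the pointer was newly added.
SmallVector<Value *, 8> uniqueValues(ArrayRef<Value *> Vals) {
  SmallPtrSet<Value *, 8> Seen;
  SmallVector<Value *, 8> Order;
  for (Value *V : Vals)
    if (Seen.insert(V).second)
      Order.push_back(V);
  return Order;
}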
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
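A sketch of querying TargetTransformInfo costs at reciprocal-throughput cost kind (the helper name and vector type are hypothetical):
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;
// Compare the throughput cost of a widened add against a widened load.
void compareCosts(const TargetTransformInfo &TTI, VectorType *VecTy) {
  InstructionCost AddCost = TTI.getArithmeticInstrCost(
      Instruction::Add, VecTy, TTI::TCK_RecipThroughput);
  InstructionCost LoadCost = TTI.getMemoryOpCost(
      Instruction::Load, VecTy, Align(4), /*AddressSpace=*/0,
      TTI::TCK_RecipThroughput);
  if (AddCost.isValid() && LoadCost.isValid() && AddCost > LoadCost) {
    // ... a cost-model decision would go here ...
  }
}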
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:243
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
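A sketch of the Type queries above, widening a scalar type to a vector of VF elements in the spirit of the ToVectorTy helper listed further below (the function name is hypothetical):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/TypeSize.h"
#include <cassert>
using namespace llvm;
Type *widenType(Type *Scalar, ElementCount VF) {
  assert(!Scalar->isVectorTy() && "expected a scalar type");
  if (Scalar->isVoidTy() || Scalar->isTokenTy())
    return Scalar;                    // cannot be widened
  return VectorType::get(Scalar, VF); // fixed or scalable vector
}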
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2825
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:2893
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:443
iterator end()
Definition: VPlan.h:2856
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:2854
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:2903
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:210
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:2884
bool empty() const
Definition: VPlan.h:2865
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:1948
VPRegionBlock * getParent()
Definition: VPlan.h:489
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:175
void setName(const Twine &newName)
Definition: VPlan.h:482
VPlan * getPlan()
Definition: VPlan.cpp:148
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:153
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:524
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3382
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
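A sketch of the VPlan-internal VPBuilder API above (the helper name, block, and operands are hypothetical; the include location of VPBuilder inside this pass's sources is an assumption):
#include "LoopVectorizationPlanner.h" // assumed home of VPBuilder
#include "VPlan.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
// Build a compare-and-select pair at the end of a VPBasicBlock; all operands
// are assumed to be existing VPValues in the same VPlan.
VPValue *emitClampToBound(VPBuilder &Builder, VPBasicBlock *VPBB,
                          VPValue *X, VPValue *Bound) {
  Builder.setInsertPoint(VPBB);
  VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_ULE, X, Bound);
  return Builder.createSelect(Cmp, X, Bound);
}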
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2564
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:423
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:401
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:413
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:2763
VPValue * getStartValue() const
Definition: VPlan.h:2762
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1634
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1678
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1667
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1159
@ FirstOrderRecurrenceSplice
Definition: VPlan.h:1165
unsigned getOpcode() const
Definition: VPlan.h:1259
VPInterleaveRecipe is a recipe for transforming an interleave group of loads or stores into one wide l...
Definition: VPlan.h:2005
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2046
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2052
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2059
unsigned getNumStoreOperands() const
Returns the number of stored operands of this interleave group.
Definition: VPlan.h:2079
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:169
static VPLane getFirstLane()
Definition: VPlan.h:167
A value that is used outside the VPlan.
Definition: VPlan.h:669
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:709
VPBasicBlock * getParent()
Definition: VPlan.h:734
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:800
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1082
A recipe for handling reduction phis.
Definition: VPlan.h:1889
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:1943
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:1935
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2094
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:2958
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3029
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2142
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:826
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:888
This class can be used to assign names to VPValues.
Definition: VPlanValue.h:454
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:203
operand_range operands()
Definition: VPlanValue.h:278
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:258
unsigned getNumOperands() const
Definition: VPlanValue.h:252
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:253
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:247
Value * getUnderlyingValue()
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:77
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition: VPlan.cpp:1302
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1270
user_iterator user_begin()
Definition: VPlanValue.h:129
unsigned getNumUsers() const
Definition: VPlanValue.h:112
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:173
user_iterator user_end()
Definition: VPlanValue.h:131
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:168
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1274
user_range users()
Definition: VPlanValue.h:133
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1578
A recipe for widening Call instructions.
Definition: VPlan.h:1449
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2689
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1360
A recipe for handling GEP instructions.
Definition: VPlan.h:1536
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1691
A common base class for widening memory operations.
Definition: VPlan.h:2299
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition: VPlan.h:2307
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition: VPlan.h:2346
Instruction & Ingredient
Definition: VPlan.h:2301
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2360
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2353
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition: VPlan.h:2350
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1817
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1856
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1853
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a vector-type copy of its ingredient.
Definition: VPlan.h:1328
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3059
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:783
VPBasicBlock * getEntry()
Definition: VPlan.h:3152
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3177
void setName(const Twine &newName)
Definition: VPlan.h:3208
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3180
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3156
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3170
void removeLiveOut(PHINode *PN)
Definition: VPlan.h:3265
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:993
VPBasicBlock * getPreheader()
Definition: VPlan.h:3284
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3246
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE)
Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping original scalar pre-header) w...
Definition: VPlan.cpp:769
bool hasVF(ElementCount VF)
Definition: VPlan.h:3190
bool hasUF(unsigned UF) const
Definition: VPlan.h:3197
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3163
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3212
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:990
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:825
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3254
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3270
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3274
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1074
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:693
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
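A sketch of the Value RAUW pattern used when a new definition supersedes an old one (the helper name is hypothetical):
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;
// Keep the old name on the replacement, then redirect all uses to it.
void replaceAndRename(Instruction *Old, Value *New) {
  if (New->getType() == Old->getType()) {
    New->setName(Old->getName());
    Old->replaceAllUsesWith(New);
  }
}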
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:77
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:73
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:229
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:215
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:255
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:222
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:251
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:236
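A sketch of the FixedOrScalableQuantity interface via ElementCount, which treats "4" and "vscale x 4" uniformly (the helper name is hypothetical):
#include "llvm/Support/TypeSize.h"
using namespace llvm;
void halveVF(ElementCount VF) {
  if (VF.isScalable() || VF.getKnownMinValue() > 1) {
    // Halving scales only the known-minimum coefficient.
    ElementCount Half = VF.divideCoefficientBy(2);
    // Ordering between mixed fixed/scalable quantities is only partial,
    // hence the isKnown* comparators.
    bool Smaller = ElementCount::isKnownLT(Half, VF);
    (void)Smaller;
  }
}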
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:810
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
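A sketch of the PatternMatch combinators above, recognizing a single-use multiply of a zero/sign-extended operand (the helper name is hypothetical):
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// On success, Ext holds the value feeding the zext/sext and Other holds the
// remaining multiplicand.
bool isExtendedMul(Value *V, Value *&Ext, Value *&Other) {
  return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(Ext)), m_Value(Other))));
}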
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1459
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3606
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:456
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1808
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7063
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:53
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:134
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:133
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2430
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1616
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
@ Invalid
Denotes invalid value.
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1868
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
#define OP(n)
Definition: regex2.h:73
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
ElementCountComparator creates a total ordering for ElementCount for the purposes of using it in a se...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
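A minimal sketch of running LoopVectorizePass standalone through the new pass manager (the function name is hypothetical; in practice the pass is normally reached via the default pipeline, e.g. opt -passes=loop-vectorize):
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
using namespace llvm;
void runLoopVectorize(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the standard analyses and cross-register the proxies so the
  // vectorizer can request ScalarEvolution, LoopInfo, TTI, etc.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
  FunctionPassManager FPM;
  FPM.addPass(LoopVectorizePass());
  for (Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);
}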
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:74
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:85
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:87
ElementCount End
Definition: VPlan.h:92
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1862
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:219
bool isFirstIteration() const
Definition: VPlan.h:231
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:365
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:373
BasicBlock * ExitBB
The last IR BasicBlock in the output IR.
Definition: VPlan.h:369
BasicBlock * getPreheaderBBFor(VPRecipeBase *R)
Returns the BasicBlock* mapped to the pre-header of the loop region containing R.
Definition: VPlan.cpp:348
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:236
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:247
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:409
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:412
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:361
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:405
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:353
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:393
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:288
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:248
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:389
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlan.h:395
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:392
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:242
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:372
void execute(VPTransformState &State) override
Generate the wide load or gather.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2426
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2375
void execute(VPTransformState &State) override
Generate a wide load or gather.
A recipe for widening select instructions.
Definition: VPlan.h:1502
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2502
void execute(VPTransformState &State) override
Generate the wide store or scatter.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2505
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2449
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2466
static void addExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.