LLVM 18.0.0git
LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to determine the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
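//
// For illustration only (a conceptual sketch, not code from this file): a
// loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each vector iteration handles VF
// consecutive elements and the induction variable steps by VF, with the
// remaining n % VF iterations handled by a scalar remainder (epilogue) loop:
//
//   int i = 0;
//   for (; i + VF <= n; i += VF)          // vector body: one wide operation
//     vec_store(&a[i], vec_add(vec_load(&b[i]), vec_load(&c[i])));
//   for (; i < n; ++i)                    // scalar epilogue
//     a[i] = b[i] + c[i];
//
// Here VF, vec_load, vec_add and vec_store are placeholders for the target's
// vector width and SIMD operations.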
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanTransforms.h"
62#include "llvm/ADT/APInt.h"
63#include "llvm/ADT/ArrayRef.h"
64#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/Hashing.h"
67#include "llvm/ADT/MapVector.h"
68#include "llvm/ADT/STLExtras.h"
70#include "llvm/ADT/SmallSet.h"
72#include "llvm/ADT/Statistic.h"
73#include "llvm/ADT/StringRef.h"
74#include "llvm/ADT/Twine.h"
79#include "llvm/Analysis/CFG.h"
95#include "llvm/IR/Attributes.h"
96#include "llvm/IR/BasicBlock.h"
97#include "llvm/IR/CFG.h"
98#include "llvm/IR/Constant.h"
99#include "llvm/IR/Constants.h"
100#include "llvm/IR/DataLayout.h"
101#include "llvm/IR/DebugInfo.h"
103#include "llvm/IR/DebugLoc.h"
104#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/Dominators.h"
107#include "llvm/IR/Function.h"
108#include "llvm/IR/IRBuilder.h"
109#include "llvm/IR/InstrTypes.h"
110#include "llvm/IR/Instruction.h"
111#include "llvm/IR/Instructions.h"
113#include "llvm/IR/Intrinsics.h"
114#include "llvm/IR/Metadata.h"
115#include "llvm/IR/Module.h"
116#include "llvm/IR/Operator.h"
117#include "llvm/IR/PatternMatch.h"
118#include "llvm/IR/Type.h"
119#include "llvm/IR/Use.h"
120#include "llvm/IR/User.h"
121#include "llvm/IR/Value.h"
122#include "llvm/IR/ValueHandle.h"
123#include "llvm/IR/Verifier.h"
124#include "llvm/Support/Casting.h"
127#include "llvm/Support/Debug.h"
140#include <algorithm>
141#include <cassert>
142#include <cmath>
143#include <cstdint>
144#include <functional>
145#include <iterator>
146#include <limits>
147#include <map>
148#include <memory>
149#include <string>
150#include <tuple>
151#include <utility>
152
153using namespace llvm;
154
155#define LV_NAME "loop-vectorize"
156#define DEBUG_TYPE LV_NAME
157
158#ifndef NDEBUG
159const char VerboseDebug[] = DEBUG_TYPE "-verbose";
160#endif
161
162/// @{
163/// Metadata attribute names
164const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
165const char LLVMLoopVectorizeFollowupVectorized[] =
166    "llvm.loop.vectorize.followup_vectorized";
167const char LLVMLoopVectorizeFollowupEpilogue[] =
168    "llvm.loop.vectorize.followup_epilogue";
169/// @}
170
171STATISTIC(LoopsVectorized, "Number of loops vectorized");
172STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
173STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
174
176 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
177 cl::desc("Enable vectorization of epilogue loops."));
178
180 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
181 cl::desc("When epilogue vectorization is enabled, and a value greater than "
182 "1 is specified, forces the given VF for all applicable epilogue "
183 "loops."));
184
186 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
187 cl::desc("Only loops with vectorization factor equal to or larger than "
188 "the specified value are considered for epilogue vectorization."));
189
190/// Loops with a known constant trip count below this number are vectorized only
191/// if no scalar iteration overheads are incurred.
193 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
194 cl::desc("Loops with a constant trip count that is smaller than this "
195 "value are vectorized only if no scalar iteration overheads "
196 "are incurred."));
197
199 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
200 cl::desc("The maximum allowed number of runtime memory checks"));
201
202// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
203// and that predication is preferred; the values below list the available
204// strategies. I.e., the vectorizer will try to fold the tail loop (epilogue)
205// into the vector body and predicate the instructions accordingly. If
206// tail-folding fails, the fallback strategy depends on these values:
208 enum Option {
212 };
213} // namespace PreferPredicateTy
214
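// For illustration only (a conceptual sketch): with tail folding, the last
// n % VF iterations are not run in a scalar epilogue; instead the vector body
// executes ceil(n / VF) times and every lane is predicated on still being in
// bounds:
//
//   for (int i = 0; i < n; i += VF)
//     for (int l = 0; l < VF; ++l)        // becomes one masked vector op
//       if (i + l < n)
//         a[i + l] = b[i + l] + c[i + l];
//
// The per-lane condition corresponds to the mask produced by, e.g., the
// llvm.get.active.lane.mask intrinsic or a compare against the trip count.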
216 "prefer-predicate-over-epilogue",
219 cl::desc("Tail-folding and predication preferences over creating a scalar "
220 "epilogue loop."),
222 "scalar-epilogue",
223 "Don't tail-predicate loops, create scalar epilogue"),
225 "predicate-else-scalar-epilogue",
226 "prefer tail-folding, create scalar epilogue if tail "
227 "folding fails."),
229 "predicate-dont-vectorize",
230 "prefer tail-folding, don't attempt vectorization if "
231 "tail-folding fails.")));
232
234 "force-tail-folding-style", cl::desc("Force the tail folding style"),
235 cl::init(TailFoldingStyle::None),
237 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
239 TailFoldingStyle::Data, "data",
240 "Create lane mask for data only, using active.lane.mask intrinsic"),
241 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
242 "data-without-lane-mask",
243 "Create lane mask with compare/stepvector"),
244 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
245 "Create lane mask using active.lane.mask intrinsic, and use "
246 "it for both data and control flow"),
248 TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
249 "data-and-control-without-rt-check",
250 "Similar to data-and-control, but remove the runtime check")));
251
253 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
254 cl::desc("Maximize bandwidth when selecting vectorization factor which "
255 "will be determined by the smallest type in the loop."));
256
258 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
259 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
260
261/// An interleave-group may need masking if it resides in a block that needs
262/// predication, or in order to mask away gaps.
264 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
266
268 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
269 cl::desc("We don't interleave loops with an estimated constant trip count "
270 "below this number"));
271
273 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's number of scalar registers."));
275
277 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's number of vector registers."));
279
281 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's max interleave factor for "
283 "scalar loops."));
284
286 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "vectorized loops."));
289
291 "force-target-instruction-cost", cl::init(0), cl::Hidden,
292 cl::desc("A flag that overrides the target's expected cost for "
293 "an instruction to a single constant value. Mostly "
294 "useful for getting consistent testing."));
295
297 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298 cl::desc(
299 "Pretend that scalable vectors are supported, even if the target does "
300 "not support them. This flag should only be used for testing."));
301
303 "small-loop-cost", cl::init(20), cl::Hidden,
304 cl::desc(
305 "The cost of a loop that is considered 'small' by the interleaver."));
306
308 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309 cl::desc("Enable the use of the block frequency analysis to access PGO "
310 "heuristics minimizing code growth in cold regions and being more "
311 "aggressive in hot regions."));
312
313// Runtime interleave loops for load/store throughput.
315 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316 cl::desc(
317 "Enable runtime interleaving until load/store ports are saturated"));
318
319/// Interleave small loops with scalar reductions.
321 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
322 cl::desc("Enable interleaving for loops with small iteration counts that "
323 "contain scalar reductions to expose ILP."));
324
325/// The number of stores in a loop that are allowed to need predication.
327 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
328 cl::desc("Max number of stores to be predicated behind an if."));
329
331 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
332 cl::desc("Count the induction variable only once when interleaving"));
333
335 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
336 cl::desc("Enable if predication of stores during vectorization."));
337
339 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
340 cl::desc("The maximum interleave count to use when interleaving a scalar "
341 "reduction in a nested loop."));
342
343static cl::opt<bool>
344 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
346 cl::desc("Prefer in-loop vector reductions, "
347 "overriding the target's preference."));
348
350 "force-ordered-reductions", cl::init(false), cl::Hidden,
351 cl::desc("Enable the vectorisation of loops with in-order (strict) "
352 "FP reductions"));
353
355 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
356 cl::desc(
357 "Prefer predicating a reduction operation over an after-loop select."));
358
359namespace llvm {
361 "enable-vplan-native-path", cl::Hidden,
362 cl::desc("Enable VPlan-native vectorization path with "
363 "support for outer loop vectorization."));
364}
365
366// This flag enables the stress testing of the VPlan H-CFG construction in the
367// VPlan-native vectorization path. It must be used in conjunction with
368// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
369// verification of the H-CFGs built.
371 "vplan-build-stress-test", cl::init(false), cl::Hidden,
372 cl::desc(
373 "Build VPlan for every supported loop nest in the function and bail "
374 "out right after the build (stress test the VPlan H-CFG construction "
375 "in the VPlan-native vectorization path)."));
376
378 "interleave-loops", cl::init(true), cl::Hidden,
379 cl::desc("Enable loop interleaving in Loop vectorization passes"));
381 "vectorize-loops", cl::init(true), cl::Hidden,
382 cl::desc("Run the Loop vectorization passes"));
383
385 "vplan-print-in-dot-format", cl::Hidden,
386 cl::desc("Use dot format instead of plain text when dumping VPlans"));
387
389 "force-widen-divrem-via-safe-divisor", cl::Hidden,
390 cl::desc(
391 "Override cost based safe divisor widening for div/rem instructions"));
392
393/// A helper function that returns true if the given type is irregular. The
394/// type is irregular if its allocated size doesn't equal the store size of an
395/// element of the corresponding vector type.
396static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
397 // Determine if an array of N elements of type Ty is "bitcast compatible"
398 // with a <N x Ty> vector.
399 // This is only true if there is no padding between the array elements.
400 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
401}
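
// For example (illustrative): with a typical DataLayout, i1 has a type size
// of 1 bit but an alloc size of 8 bits, so an [N x i1] array is not
// bitcast-compatible with <N x i1> and i1 counts as irregular here, whereas
// i32 (32-bit size and alloc size) is regular.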
402
403/// A helper function that returns the reciprocal of the block probability of
404/// predicated blocks. If we return X, we are assuming the predicated block
405/// will execute once for every X iterations of the loop header.
406///
407/// TODO: We should use actual block probability here, if available. Currently,
408/// we always assume predicated blocks have a 50% chance of executing.
409static unsigned getReciprocalPredBlockProb() { return 2; }
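
// For example (illustrative): if scalarizing a predicated block costs C per
// vector iteration, the cost model divides C by the value returned here
// (currently 2), charging C/2 on the assumption that the block executes on
// about half of the header's iterations.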
410
411/// Returns "best known" trip count for the specified loop \p L as defined by
412/// the following procedure:
413/// 1) Returns exact trip count if it is known.
414/// 2) Returns expected trip count according to profile data if any.
415/// 3) Returns upper bound estimate if it is known.
416/// 4) Returns std::nullopt if all of the above failed.
417static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
418 Loop *L) {
419 // Check if exact trip count is known.
420 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421 return ExpectedTC;
422
423 // Check if there is an expected trip count available from profile data.
425 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426 return *EstimatedTC;
427
428 // Check if upper bound estimate is known.
429 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430 return ExpectedTC;
431
432 return std::nullopt;
433}
434
435/// Return a vector containing interleaved elements from multiple
436/// smaller input vectors.
438 const Twine &Name) {
439 unsigned Factor = Vals.size();
440 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
441
442 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
443#ifndef NDEBUG
444 for (Value *Val : Vals)
445 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
446#endif
447
448 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so we
449 // must use intrinsics to interleave.
450 if (VecTy->isScalableTy()) {
451 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
452 return Builder.CreateIntrinsic(
453 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
454 /*FMFSource=*/nullptr, Name);
455 }
456
457 // Fixed length. Start by concatenating all vectors into a wide vector.
458 Value *WideVec = concatenateVectors(Builder, Vals);
459
460 // Interleave the elements into the wide vector.
461 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
462 return Builder.CreateShuffleVector(
463 WideVec, createInterleaveMask(NumElts, Factor), Name);
464}
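
// For example (illustrative): with Factor = 2 and 4-element inputs
// A = <a0, a1, a2, a3> and B = <b0, b1, b2, b3>, the result is
// <a0, b0, a1, b1, a2, b2, a3, b3>; on the fixed-length path this is the
// shuffle of concat(A, B) with the mask <0, 4, 1, 5, 2, 6, 3, 7> produced by
// createInterleaveMask(4, 2).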
465
466namespace {
467// Forward declare GeneratedRTChecks.
468class GeneratedRTChecks;
469
470using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
471} // namespace
472
473namespace llvm {
474
476
477/// InnerLoopVectorizer vectorizes loops which contain only one basic
478/// block to a specified vectorization factor (VF).
479/// This class performs the widening of scalars into vectors, or multiple
480/// scalars. This class also implements the following features:
481/// * It inserts an epilogue loop for handling loops that don't have iteration
482/// counts that are known to be a multiple of the vectorization factor.
483/// * It handles the code generation for reduction variables.
484/// * Scalarization (implementation using scalars) of un-vectorizable
485/// instructions.
486/// InnerLoopVectorizer does not perform any vectorization-legality
487/// checks, and relies on the caller to check for the different legality
488/// aspects. The InnerLoopVectorizer relies on the
489/// LoopVectorizationLegality class to provide information about the induction
490/// and reduction variables that were found for a given vectorization factor.
492public:
495 const TargetLibraryInfo *TLI,
499 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
501 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
502 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
503 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
504 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
506 // Query this against the original loop and save it here because the profile
507 // of the original loop header may change as the transformation happens.
510
512 this->MinProfitableTripCount = VecWidth;
513 else
514 this->MinProfitableTripCount = MinProfitableTripCount;
515 }
516
517 virtual ~InnerLoopVectorizer() = default;
518
519 /// Create a new empty loop that will contain vectorized instructions later
520 /// on, while the old loop will be used as the scalar remainder. Control flow
521 /// is generated around the vectorized (and scalar epilogue) loops consisting
522 /// of various checks and bypasses. Return the pre-header block of the new
523 /// loop and the start value for the canonical induction, if it is != 0. The
524 /// latter is the case when vectorizing the epilogue loop. In the case of
525 /// epilogue vectorization, this function is overriden to handle the more
526 /// complex control flow around the loops. \p ExpandedSCEVs is used to
527 /// look up SCEV expansions for expressions needed during skeleton creation.
528 virtual std::pair<BasicBlock *, Value *>
529 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
530
531 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
532 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
533
534 // Return true if any runtime check is added.
536
537 /// A type for vectorized values in the new loop. Each value from the
538 /// original loop, when vectorized, is represented by UF vector values in the
539 /// new unrolled loop, where UF is the unroll factor.
541
542 /// A helper function to scalarize a single Instruction in the innermost loop.
543 /// Generates a sequence of scalar instances for each lane between \p MinLane
544 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
545 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
546 /// Instr's operands.
547 void scalarizeInstruction(const Instruction *Instr,
548 VPReplicateRecipe *RepRecipe,
549 const VPIteration &Instance,
550 VPTransformState &State);
551
552 /// Try to vectorize interleaved access group \p Group with the base address
553 /// given in \p Addr, optionally masking the vector operations if \p
554 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
555 /// values in the vectorized loop.
557 ArrayRef<VPValue *> VPDefs,
559 ArrayRef<VPValue *> StoredValues,
560 VPValue *BlockInMask, bool NeedsMaskForGaps);
561
562 /// Fix the non-induction PHIs in \p Plan.
563 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
564
565 /// Returns true if the reordering of FP operations is not allowed, but we are
566 /// able to vectorize with strict in-order reductions for the given RdxDesc.
567 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
568
569 // Returns the resume value (bc.merge.rdx) for a reduction as
570 // generated by fixReduction.
572
573 /// Create a new phi node for the induction variable \p OrigPhi to resume
574 /// iteration count in the scalar epilogue, from where the vectorized loop
575 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
576 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
577 /// and the resume values can come from an additional bypass block, the \p
578 /// AdditionalBypass pair provides information about the bypass block and the
579 /// end value on the edge from bypass to this loop.
581 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
582 ArrayRef<BasicBlock *> BypassBlocks,
583 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
584
585 /// Returns the original loop trip count.
586 Value *getTripCount() const { return TripCount; }
587
588 /// Used to set the trip count after ILV's construction and after the
589 /// preheader block has been executed. Note that this always holds the trip
590 /// count of the original loop for both main loop and epilogue vectorization.
591 void setTripCount(Value *TC) { TripCount = TC; }
592
593protected:
595
596 /// A small list of PHINodes.
598
599 /// A type for scalarized values in the new loop. Each value from the
600 /// original loop, when scalarized, is represented by UF x VF scalar values
601 /// in the new unrolled loop, where UF is the unroll factor and VF is the
602 /// vectorization factor.
604
605 /// Set up the values of the IVs correctly when exiting the vector loop.
606 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
607 Value *VectorTripCount, Value *EndValue,
608 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
609 VPlan &Plan, VPTransformState &State);
610
611 /// Handle all cross-iteration phis in the header.
613
614 /// Create the exit value of first order recurrences in the middle block and
615 /// update their users.
617 VPTransformState &State);
618
619 /// Create code for the loop exit value of the reduction.
621
622 /// Iteratively sink the scalarized operands of a predicated instruction into
623 /// the block that was created for it.
624 void sinkScalarOperands(Instruction *PredInst);
625
626 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
627 /// represented as.
629
630 /// Returns (and creates if needed) the trip count of the widened loop.
632
633 /// Returns a bitcasted value to the requested vector type.
634 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
636 const DataLayout &DL);
637
638 /// Emit a bypass check to see if the vector trip count is zero, including if
639 /// it overflows.
641
642 /// Emit a bypass check to see if all of the SCEV assumptions we've
643 /// had to make are correct. Returns the block containing the checks or
644 /// nullptr if no checks have been added.
646
647 /// Emit bypass checks to check any memory assumptions we may have made.
648 /// Returns the block containing the checks or nullptr if no checks have been
649 /// added.
651
652 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
653 /// vector loop preheader, middle block and scalar preheader.
655
656 /// Create new phi nodes for the induction variables to resume iteration count
657 /// in the scalar epilogue, from where the vectorized loop left off.
658 /// In cases where the loop skeleton is more complicated (eg. epilogue
659 /// vectorization) and the resume values can come from an additional bypass
660 /// block, the \p AdditionalBypass pair provides information about the bypass
661 /// block and the end value on the edge from bypass to this loop.
663 const SCEV2ValueTy &ExpandedSCEVs,
664 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
665
666 /// Complete the loop skeleton by adding debug MDs, creating appropriate
667 /// conditional branches in the middle block, preparing the builder and
668 /// running the verifier. Return the preheader of the completed vector loop.
670
671 /// Collect poison-generating recipes that may generate a poison value that is
672 /// used after vectorization, even when their operands are not poison. Those
673 /// recipes meet the following conditions:
674 /// * Contribute to the address computation of a recipe generating a widen
675 /// memory load/store (VPWidenMemoryInstructionRecipe or
676 /// VPInterleaveRecipe).
677 /// * Such a widen memory load/store has at least one underlying Instruction
678 /// that is in a basic block that needs predication and after vectorization
679 /// the generated instruction won't be predicated.
681
682 /// Allow subclasses to override and print debug traces before/after vplan
683 /// execution, when trace information is requested.
684 virtual void printDebugTracesAtStart(){};
685 virtual void printDebugTracesAtEnd(){};
686
687 /// The original loop.
689
690 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
691 /// dynamic knowledge to simplify SCEV expressions and converts them to a
692 /// more usable form.
694
695 /// Loop Info.
697
698 /// Dominator Tree.
700
701 /// Target Library Info.
703
704 /// Target Transform Info.
706
707 /// Assumption Cache.
709
710 /// Interface to emit optimization remarks.
712
713 /// The vectorization SIMD factor to use. Each vector will have this many
714 /// vector elements.
716
718
719 /// The vectorization unroll factor to use. Each scalar is vectorized to this
720 /// many different vector instructions.
721 unsigned UF;
722
723 /// The builder that we use
725
726 // --- Vectorization state ---
727
728 /// The vector-loop preheader.
730
731 /// The scalar-loop preheader.
733
734 /// Middle Block between the vector and the scalar.
736
737 /// The unique ExitBlock of the scalar loop if one exists. Note that
738 /// there can be multiple exiting edges reaching this block.
740
741 /// The scalar loop body.
743
744 /// A list of all bypass blocks. The first block is the entry of the loop.
746
747 /// Store instructions that were predicated.
749
750 /// Trip count of the original loop.
751 Value *TripCount = nullptr;
752
753 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
755
756 /// The legality analysis.
758
759 /// The profitability analysis.
761
762 // Record whether runtime checks are added.
763 bool AddedSafetyChecks = false;
764
765 // Holds the end values for each induction variable. We save the end values
766 // so we can later fix-up the external users of the induction variables.
768
769 /// BFI and PSI are used to check for profile guided size optimizations.
772
773 // Whether this loop should be optimized for size based on profile guided size
774 // optimizations.
776
777 /// Structure to hold information about generated runtime checks, responsible
778 /// for cleaning the checks, if vectorization turns out unprofitable.
779 GeneratedRTChecks &RTChecks;
780
781 // Holds the resume values for reductions in the loops, used to set the
782 // correct start value of reduction PHIs when vectorizing the epilogue.
785};
786
788public:
791 const TargetLibraryInfo *TLI,
793 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
796 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
798 ElementCount::getFixed(1),
799 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
800 BFI, PSI, Check) {}
801};
802
803/// Encapsulate information regarding vectorization of a loop and its epilogue.
804/// This information is meant to be updated and used across two stages of
805/// epilogue vectorization.
808 unsigned MainLoopUF = 0;
810 unsigned EpilogueUF = 0;
815 Value *TripCount = nullptr;
817
819 ElementCount EVF, unsigned EUF)
820 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
821 assert(EUF == 1 &&
822 "A high UF for the epilogue loop is likely not beneficial.");
823 }
824};
825
826/// An extension of the inner loop vectorizer that creates a skeleton for a
827/// vectorized loop that has its epilogue (residual) also vectorized.
828/// The idea is to run the vplan on a given loop twice, first to set up the
829/// skeleton and vectorize the main loop, and second to complete the skeleton
830/// from the first step and vectorize the epilogue. This is achieved by
831/// deriving two concrete strategy classes from this base class and invoking
832/// them in succession from the loop vectorizer planner.
834public:
842 GeneratedRTChecks &Checks)
844 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
845 CM, BFI, PSI, Checks),
846 EPI(EPI) {}
847
848 // Override this function to handle the more complex control flow around the
849 // three loops.
850 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
851 const SCEV2ValueTy &ExpandedSCEVs) final {
852 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
853 }
854
855 /// The interface for creating a vectorized skeleton using one of two
856 /// different strategies, each corresponding to one execution of the vplan
857 /// as described above.
858 virtual std::pair<BasicBlock *, Value *>
859 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
860
861 /// Holds and updates state information required to vectorize the main loop
862 /// and its epilogue in two separate passes. This setup helps us avoid
863 /// regenerating and recomputing runtime safety checks. It also helps us to
864 /// shorten the iteration-count-check path length for the cases where the
865 /// iteration count of the loop is so small that the main vector loop is
866 /// completely skipped.
868};
869
870/// A specialized derived class of inner loop vectorizer that performs
871/// vectorization of *main* loops in the process of vectorizing loops and their
872/// epilogues.
874public:
882 GeneratedRTChecks &Check)
884 EPI, LVL, CM, BFI, PSI, Check) {}
885 /// Implements the interface for creating a vectorized skeleton using the
886 /// *main loop* strategy (ie the first pass of vplan execution).
887 std::pair<BasicBlock *, Value *>
888 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
889
890protected:
891 /// Emits an iteration count bypass check once for the main loop (when \p
892 /// ForEpilogue is false) and once for the epilogue loop (when \p
893 /// ForEpilogue is true).
894 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
895 void printDebugTracesAtStart() override;
896 void printDebugTracesAtEnd() override;
897};
898
899// A specialized derived class of inner loop vectorizer that performs
900// vectorization of *epilogue* loops in the process of vectorizing loops and
901// their epilogues.
903public:
911 GeneratedRTChecks &Checks)
913 EPI, LVL, CM, BFI, PSI, Checks) {
915 }
916 /// Implements the interface for creating a vectorized skeleton using the
917 /// *epilogue loop* strategy (ie the second pass of vplan execution).
918 std::pair<BasicBlock *, Value *>
919 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
920
921protected:
922 /// Emits an iteration count bypass check after the main vector loop has
923 /// finished to see if there are any iterations left to execute by either
924 /// the vector epilogue or the scalar epilogue.
926 BasicBlock *Bypass,
927 BasicBlock *Insert);
928 void printDebugTracesAtStart() override;
929 void printDebugTracesAtEnd() override;
930};
931} // end namespace llvm
932
933/// Look for a meaningful debug location on the instruction or its
934/// operands.
936 if (!I)
937 return DebugLoc();
938
940 if (I->getDebugLoc() != Empty)
941 return I->getDebugLoc();
942
943 for (Use &Op : I->operands()) {
944 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
945 if (OpInst->getDebugLoc() != Empty)
946 return OpInst->getDebugLoc();
947 }
948
949 return I->getDebugLoc();
950}
951
952/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
953/// is passed, the message relates to that particular instruction.
954#ifndef NDEBUG
955static void debugVectorizationMessage(const StringRef Prefix,
956 const StringRef DebugMsg,
957 Instruction *I) {
958 dbgs() << "LV: " << Prefix << DebugMsg;
959 if (I != nullptr)
960 dbgs() << " " << *I;
961 else
962 dbgs() << '.';
963 dbgs() << '\n';
964}
965#endif
966
967/// Create an analysis remark that explains why vectorization failed
968///
969/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
970/// RemarkName is the identifier for the remark. If \p I is passed it is an
971/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
972/// the location of the remark. \return the remark object that can be
973/// streamed to.
975 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
976 Value *CodeRegion = TheLoop->getHeader();
977 DebugLoc DL = TheLoop->getStartLoc();
978
979 if (I) {
980 CodeRegion = I->getParent();
981 // If there is no debug location attached to the instruction, fall back to
982 // using the loop's.
983 if (I->getDebugLoc())
984 DL = I->getDebugLoc();
985 }
986
987 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
988}
989
990namespace llvm {
991
992/// Return a value for Step multiplied by VF.
994 int64_t Step) {
995 assert(Ty->isIntegerTy() && "Expected an integer step");
996 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
997}
998
999/// Return the runtime value for VF.
1001 return B.CreateElementCount(Ty, VF);
1002}
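
// For example (illustrative): for a fixed VF of 4, getRuntimeVF returns the
// constant 4 and createStepForVF with Step = -1 returns -4; for a scalable
// VF of <vscale x 4> they produce expressions equivalent to vscale * 4 and
// vscale * -4, built via IRBuilder::CreateElementCount.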
1003
1005 Loop *OrigLoop) {
1006 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
1007 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
1008
1009 ScalarEvolution &SE = *PSE.getSE();
1010 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
1011}
1012
1014 const StringRef OREMsg, const StringRef ORETag,
1015 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1016 Instruction *I) {
1017 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1018 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1019 ORE->emit(
1020 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1021 << "loop not vectorized: " << OREMsg);
1022}
1023
1024void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1025 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1026 Instruction *I) {
1028 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1029 ORE->emit(
1030 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1031 << Msg);
1032}
1033
1034/// Report successful vectorization of the loop. In case an outer loop is
1035/// vectorized, prepend "outer" to the vectorization remark.
1037 VectorizationFactor VF, unsigned IC) {
1039 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1040 nullptr));
1041 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1042 ORE->emit([&]() {
1043 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1044 TheLoop->getHeader())
1045 << "vectorized " << LoopType << "loop (vectorization width: "
1046 << ore::NV("VectorizationFactor", VF.Width)
1047 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1048 });
1049}
1050
1051} // end namespace llvm
1052
1053#ifndef NDEBUG
1054/// \return string containing a file name and a line # for the given loop.
1055static std::string getDebugLocString(const Loop *L) {
1056 std::string Result;
1057 if (L) {
1058 raw_string_ostream OS(Result);
1059 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1060 LoopDbgLoc.print(OS);
1061 else
1062 // Just print the module name.
1063 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1064 OS.flush();
1065 }
1066 return Result;
1067}
1068#endif
1069
1071 VPTransformState &State) {
1072
1073 // Collect recipes in the backward slice of `Root` that may generate a poison
1074 // value that is used after vectorization.
1076 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1078 Worklist.push_back(Root);
1079
1080 // Traverse the backward slice of Root through its use-def chain.
1081 while (!Worklist.empty()) {
1082 VPRecipeBase *CurRec = Worklist.back();
1083 Worklist.pop_back();
1084
1085 if (!Visited.insert(CurRec).second)
1086 continue;
1087
1088 // Prune search if we find another recipe generating a widen memory
1089 // instruction. Widen memory instructions involved in address computation
1090 // will lead to gather/scatter instructions, which don't need to be
1091 // handled.
1092 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1093 isa<VPInterleaveRecipe>(CurRec) ||
1094 isa<VPScalarIVStepsRecipe>(CurRec) ||
1095 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1096 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1097 continue;
1098
1099 // This recipe contributes to the address computation of a widen
1100 // load/store. If the underlying instruction has poison-generating flags,
1101 // drop them directly.
1102 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1103 RecWithFlags->dropPoisonGeneratingFlags();
1104 } else {
1105 Instruction *Instr = CurRec->getUnderlyingInstr();
1106 (void)Instr;
1107 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1108 "found instruction with poison generating flags not covered by "
1109 "VPRecipeWithIRFlags");
1110 }
1111
1112 // Add new definitions to the worklist.
1113 for (VPValue *operand : CurRec->operands())
1114 if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1115 Worklist.push_back(OpDef);
1116 }
1117 });
1118
1119 // Traverse all the recipes in the VPlan and collect the poison-generating
1120 // recipes in the backward slice starting at the address of a VPWidenRecipe or
1121 // VPInterleaveRecipe.
1122 auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1123 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1124 for (VPRecipeBase &Recipe : *VPBB) {
1125 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1126 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1127 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1128 if (AddrDef && WidenRec->isConsecutive() &&
1129 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1130 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1131 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1132 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1133 if (AddrDef) {
1134 // Check if any member of the interleave group needs predication.
1135 const InterleaveGroup<Instruction> *InterGroup =
1136 InterleaveRec->getInterleaveGroup();
1137 bool NeedPredication = false;
1138 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1139 I < NumMembers; ++I) {
1140 Instruction *Member = InterGroup->getMember(I);
1141 if (Member)
1142 NeedPredication |=
1143 Legal->blockNeedsPredication(Member->getParent());
1144 }
1145
1146 if (NeedPredication)
1147 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1148 }
1149 }
1150 }
1151 }
1152}
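
// For example (illustrative): consider a conditional access whose address is
// computed with an inbounds GEP:
//
//   if (i < bound)
//     x = a[i];   // address computed with "getelementptr inbounds"
//
// After vectorization the address is computed unconditionally for all lanes
// and the load is masked instead. Lanes with i >= bound could then turn the
// inbounds GEP into poison even though the masked load never uses them, so
// the poison-generating flags (inbounds, nuw, nsw) are dropped from the
// recipes feeding such addresses to keep the vectorized IR well-defined.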
1153
1155 const RecurrenceDescriptor &RdxDesc) {
1156 auto It = ReductionResumeValues.find(&RdxDesc);
1157 assert(It != ReductionResumeValues.end() &&
1158 "Expected to find a resume value for the reduction.");
1159 return It->second;
1160}
1161
1162namespace llvm {
1163
1164// Loop vectorization cost-model hints for how the scalar epilogue loop should be
1165// lowered.
1167
1168 // The default: allowing scalar epilogues.
1170
1171 // Vectorization with OptForSize: don't allow epilogues.
1173
1174 // A special case of vectorisation with OptForSize: loops with a very small
1175 // trip count are considered for vectorization under OptForSize, thereby
1176 // making sure the cost of their loop body is dominant, free of runtime
1177 // guards and scalar iteration overheads.
1179
1180 // Loop hint predicate indicating an epilogue is undesired.
1182
1183 // Directive indicating we must either tail fold or not vectorize
1186
1187using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1188
1189/// LoopVectorizationCostModel - estimates the expected speedups due to
1190/// vectorization.
1191/// In many cases vectorization is not profitable. This can happen because of
1192/// a number of reasons. In this class we mainly attempt to predict the
1193/// expected speedup/slowdowns due to the supported instruction set. We use the
1194/// TargetTransformInfo to query the different backends for the cost of
1195/// different operations.
1197public:
1201 const TargetTransformInfo &TTI,
1207 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1208 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1209 Hints(Hints), InterleaveInfo(IAI) {}
1210
1211 /// \return An upper bound for the vectorization factors (both fixed and
1212 /// scalable). If the factors are 0, vectorization and interleaving should be
1213 /// avoided up front.
1214 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1215
1216 /// \return True if runtime checks are required for vectorization, and false
1217 /// otherwise.
1218 bool runtimeChecksRequired();
1219
1220 /// Set up cost-based decisions for the user vectorization factor.
1221 /// \return true if the UserVF is a feasible VF to be chosen.
1225 return expectedCost(UserVF).first.isValid();
1226 }
1227
1228 /// \return The size (in bits) of the smallest and widest types in the code
1229 /// that needs to be vectorized. We ignore values that remain scalar such as
1230 /// 64 bit loop indices.
1231 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1232
1233 /// \return The desired interleave count.
1234 /// If interleave count has been specified by metadata it will be returned.
1235 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1236 /// are the selected vectorization factor and the cost of the selected VF.
1237 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1238
1239 /// A memory access instruction may be vectorized in more than one way.
1240 /// The form of the instruction after vectorization depends on cost.
1241 /// This function takes cost-based decisions for Load/Store instructions
1242 /// and collects them in a map. This decisions map is used for building
1243 /// the lists of loop-uniform and loop-scalar instructions.
1244 /// The calculated cost is saved with widening decision in order to
1245 /// avoid redundant calculations.
1247
1248 /// A struct that represents some properties of the register usage
1249 /// of a loop.
1251 /// Holds the number of loop invariant values that are used in the loop.
1252 /// The key is ClassID of target-provided register class.
1254 /// Holds the maximum number of concurrent live intervals in the loop.
1255 /// The key is ClassID of target-provided register class.
1257 };
1258
1259 /// \return Returns information about the register usages of the loop for the
1260 /// given vectorization factors.
1263
1264 /// Collect values we want to ignore in the cost model.
1265 void collectValuesToIgnore();
1266
1267 /// Collect all element types in the loop for which widening is needed.
1269
1270 /// Split reductions into those that happen in the loop, and those that happen
1271/// outside. In-loop reductions are collected into InLoopReductions.
1273
1274 /// Returns true if we should use strict in-order reductions for the given
1275 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1276 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1277 /// of FP operations.
1278 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1279 return !Hints->allowReordering() && RdxDesc.isOrdered();
1280 }
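
  // For example (illustrative): a strict FP reduction such as
  //
  //   float s = 0.0f;
  //   for (int i = 0; i < n; ++i)
  //     s += a[i];   // no reassociation allowed without fast-math
  //
  // cannot be split into VF independent partial sums, because combining them
  // at the end would reorder the additions. With ordered reductions it is
  // vectorized with an in-loop, in-order reduction (e.g. llvm.vector.reduce.fadd
  // using a strict start value), preserving the scalar rounding behaviour.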
1281
1282 /// \returns The smallest bitwidth each instruction can be represented with.
1283 /// The vector equivalents of these instructions should be truncated to this
1284 /// type.
1286 return MinBWs;
1287 }
1288
1289 /// \returns True if it is more profitable to scalarize instruction \p I for
1290 /// vectorization factor \p VF.
1292 assert(VF.isVector() &&
1293 "Profitable to scalarize relevant only for VF > 1.");
1294
1295 // Cost model is not run in the VPlan-native path - return conservative
1296 // result until this changes.
1298 return false;
1299
1300 auto Scalars = InstsToScalarize.find(VF);
1301 assert(Scalars != InstsToScalarize.end() &&
1302 "VF not yet analyzed for scalarization profitability");
1303 return Scalars->second.contains(I);
1304 }
1305
1306 /// Returns true if \p I is known to be uniform after vectorization.
1308 // Pseudo probes need to be duplicated for each unrolled iteration and
1309 // vector lane so that the profiled loop trip count can be accurately
1310 // accumulated instead of being undercounted.
1311 if (isa<PseudoProbeInst>(I))
1312 return false;
1313
1314 if (VF.isScalar())
1315 return true;
1316
1317 // Cost model is not run in the VPlan-native path - return conservative
1318 // result until this changes.
1320 return false;
1321
1322 auto UniformsPerVF = Uniforms.find(VF);
1323 assert(UniformsPerVF != Uniforms.end() &&
1324 "VF not yet analyzed for uniformity");
1325 return UniformsPerVF->second.count(I);
1326 }
1327
1328 /// Returns true if \p I is known to be scalar after vectorization.
1330 if (VF.isScalar())
1331 return true;
1332
1333 // Cost model is not run in the VPlan-native path - return conservative
1334 // result until this changes.
1336 return false;
1337
1338 auto ScalarsPerVF = Scalars.find(VF);
1339 assert(ScalarsPerVF != Scalars.end() &&
1340 "Scalar values are not calculated for VF");
1341 return ScalarsPerVF->second.count(I);
1342 }
1343
1344 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1345 /// for vectorization factor \p VF.
1347 return VF.isVector() && MinBWs.contains(I) &&
1348 !isProfitableToScalarize(I, VF) &&
1350 }
1351
1352 /// Decision that was taken during cost calculation for memory instruction.
1355 CM_Widen, // For consecutive accesses with stride +1.
1356 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1361
1362 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1363 /// instruction \p I and vector width \p VF.
1366 assert(VF.isVector() && "Expected VF >=2");
1367 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1368 }
1369
1370 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1371 /// interleaving group \p Grp and vector width \p VF.
1375 assert(VF.isVector() && "Expected VF >=2");
1376 /// Broadcast this decision to all instructions inside the group.
1377 /// But the cost will be assigned to one instruction only.
1378 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1379 if (auto *I = Grp->getMember(i)) {
1380 if (Grp->getInsertPos() == I)
1381 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1382 else
1383 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1384 }
1385 }
1386 }
1387
1388 /// Return the cost model decision for the given instruction \p I and vector
1389 /// width \p VF. Return CM_Unknown if this instruction did not pass
1390 /// through the cost modeling.
1392 assert(VF.isVector() && "Expected VF to be a vector VF");
1393 // Cost model is not run in the VPlan-native path - return conservative
1394 // result until this changes.
1396 return CM_GatherScatter;
1397
1398 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1399 auto Itr = WideningDecisions.find(InstOnVF);
1400 if (Itr == WideningDecisions.end())
1401 return CM_Unknown;
1402 return Itr->second.first;
1403 }
1404
1405 /// Return the vectorization cost for the given instruction \p I and vector
1406 /// width \p VF.
1408 assert(VF.isVector() && "Expected VF >=2");
1409 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1410 assert(WideningDecisions.contains(InstOnVF) &&
1411 "The cost is not calculated");
1412 return WideningDecisions[InstOnVF].second;
1413 }
1414
1415 /// Return True if instruction \p I is an optimizable truncate whose operand
1416 /// is an induction variable. Such a truncate will be removed by adding a new
1417 /// induction variable with the destination type.
1419 // If the instruction is not a truncate, return false.
1420 auto *Trunc = dyn_cast<TruncInst>(I);
1421 if (!Trunc)
1422 return false;
1423
1424 // Get the source and destination types of the truncate.
1425 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1426 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1427
1428 // If the truncate is free for the given types, return false. Replacing a
1429 // free truncate with an induction variable would add an induction variable
1430 // update instruction to each iteration of the loop. We exclude from this
1431 // check the primary induction variable since it will need an update
1432 // instruction regardless.
1433 Value *Op = Trunc->getOperand(0);
1434 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1435 return false;
1436
1437 // If the truncated value is not an induction variable, return false.
1438 return Legal->isInductionPhi(Op);
1439 }
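
  // For example (illustrative):
  //
  //   for (int64_t i = 0; i < n; ++i)
  //     use((int32_t)i);   // truncate of the primary induction variable
  //
  // If the trunc is not free, it is cheaper to introduce a second, 32-bit
  // induction variable that tracks the truncated value directly, so the
  // vectorized loop needs no per-iteration truncates.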
1440
1441 /// Collects the instructions to scalarize for each predicated instruction in
1442 /// the loop.
1444
1445 /// Collect Uniform and Scalar values for the given \p VF.
1446 /// The sets depend on CM decision for Load/Store instructions
1447 /// that may be vectorized as interleave, gather-scatter or scalarized.
1449 // Do the analysis once.
1450 if (VF.isScalar() || Uniforms.contains(VF))
1451 return;
1453 collectLoopUniforms(VF);
1454 collectLoopScalars(VF);
1455 }
1456
1457 /// Returns true if the target machine supports masked store operation
1458 /// for the given \p DataType and kind of access to \p Ptr.
1459 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1460 return Legal->isConsecutivePtr(DataType, Ptr) &&
1461 TTI.isLegalMaskedStore(DataType, Alignment);
1462 }
1463
1464 /// Returns true if the target machine supports masked load operation
1465 /// for the given \p DataType and kind of access to \p Ptr.
1466 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1467 return Legal->isConsecutivePtr(DataType, Ptr) &&
1468 TTI.isLegalMaskedLoad(DataType, Alignment);
1469 }
1470
1471 /// Returns true if the target machine can represent \p V as a masked gather
1472 /// or scatter operation.
1474 bool LI = isa<LoadInst>(V);
1475 bool SI = isa<StoreInst>(V);
1476 if (!LI && !SI)
1477 return false;
1478 auto *Ty = getLoadStoreType(V);
1480 if (VF.isVector())
1481 Ty = VectorType::get(Ty, VF);
1482 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1483 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1484 }
1485
1486 /// Returns true if the target machine supports all of the reduction
1487 /// variables found for the given VF.
1489 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1490 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1491 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1492 }));
1493 }
1494
1495 /// Given costs for both strategies, return true if the scalar predication
1496 /// lowering should be used for div/rem. This incorporates an override
1497 /// option so it is not simply a cost comparison.
1499 InstructionCost SafeDivisorCost) const {
1500 switch (ForceSafeDivisor) {
1501 case cl::BOU_UNSET:
1502 return ScalarCost < SafeDivisorCost;
1503 case cl::BOU_TRUE:
1504 return false;
1505 case cl::BOU_FALSE:
1506 return true;
1507 };
1508 llvm_unreachable("impossible case value");
1509 }
1510
1511 /// Returns true if \p I is an instruction which requires predication and
1512 /// for which our chosen predication strategy is scalarization (i.e. we
1513 /// don't have an alternate strategy such as masking available).
1514 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1516
1517 /// Returns true if \p I is an instruction that needs to be predicated
1518 /// at runtime. The result is independent of the predication mechanism.
1519 /// Superset of instructions that return true for isScalarWithPredication.
1520 bool isPredicatedInst(Instruction *I) const;
1521
1522 /// Return the costs for our two available strategies for lowering a
1523 /// div/rem operation which requires speculating at least one lane.
1524 /// First result is for scalarization (will be invalid for scalable
1525 /// vectors); second is for the safe-divisor strategy.
1526 std::pair<InstructionCost, InstructionCost>
1528 ElementCount VF) const;
1529
1530 /// Returns true if \p I is a memory instruction with consecutive memory
1531 /// access that can be widened.
1533
1534 /// Returns true if \p I is a memory instruction in an interleaved-group
1535 /// of memory accesses that can be vectorized with wide vector loads/stores
1536 /// and shuffles.
1538
1539 /// Check if \p Instr belongs to any interleaved access group.
1541 return InterleaveInfo.isInterleaved(Instr);
1542 }
1543
1544 /// Get the interleaved access group that \p Instr belongs to.
1547 return InterleaveInfo.getInterleaveGroup(Instr);
1548 }
1549
1550 /// Returns true if we're required to use a scalar epilogue for at least
1551 /// the final iteration of the original loop.
1552 bool requiresScalarEpilogue(bool IsVectorizing) const {
1554 return false;
1555 // If we might exit from anywhere but the latch, must run the exiting
1556 // iteration in scalar form.
1558 return true;
1559 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1560 }
1561
1562 /// Returns true if we're required to use a scalar epilogue for at least
1563 /// the final iteration of the original loop for all VFs in \p Range.
1564 /// A scalar epilogue must either be required for all VFs in \p Range or for
1565 /// none.
1567 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1568 return requiresScalarEpilogue(VF.isVector());
1569 };
1570 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1571 assert(
1572 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1573 "all VFs in range must agree on whether a scalar epilogue is required");
1574 return IsRequired;
1575 }
1576
1577 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1578 /// loop hint annotation.
1580 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1581 }
1582
1583 /// Returns the TailFoldingStyle that is best for the current loop.
1585 getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1586 if (!CanFoldTailByMasking)
1588
1589 if (ForceTailFoldingStyle.getNumOccurrences())
1590 return ForceTailFoldingStyle;
1591
1592 return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
1593 }
1594
1595 /// Returns true if all loop blocks should be masked to fold tail loop.
1596 bool foldTailByMasking() const {
1598 }
1599
1600 /// Returns true if the instructions in this block require predication
1601 /// for any reason, e.g. because tail folding now requires a predicate
1602 /// or because the block in the original loop was predicated.
1605 }
1606
1607 /// Returns true if the Phi is part of an in-loop reduction.
1608 bool isInLoopReduction(PHINode *Phi) const {
1609 return InLoopReductions.contains(Phi);
1610 }
1611
1612 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1613 /// with factor VF. Return the cost of the instruction, including
1614 /// scalarization overhead if it's needed.
1616
1617 /// Estimate cost of a call instruction CI if it were vectorized with factor
1618 /// VF. Return the cost of the instruction, including scalarization overhead
1619 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1620 /// scalarized -
1621/// i.e. either a vector version isn't available, or it is too expensive.
1623 Function **Variant,
1624 bool *NeedsMask = nullptr) const;
1625
1626 /// Invalidates decisions already taken by the cost model.
1628 WideningDecisions.clear();
1629 Uniforms.clear();
1630 Scalars.clear();
1631 }
1632
1633 /// The vectorization cost is a combination of the cost itself and a boolean
1634 /// indicating whether any of the contributing operations will actually
1635 /// operate on vector values after type legalization in the backend. If this
1636 /// latter value is false, then all operations will be scalarized (i.e. no
1637 /// vectorization has actually taken place).
1638 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1639
1640 /// Returns the expected execution cost. The unit of the cost does
1641 /// not matter because we use the 'cost' units to compare different
1642 /// vector widths. The cost that is returned is *not* normalized by
1643 /// the factor width. If \p Invalid is not nullptr, this function
1644 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1645 /// each instruction that has an Invalid cost for the given VF.
1649
1650 bool hasPredStores() const { return NumPredStores > 0; }
1651
1652 /// Returns true if epilogue vectorization is considered profitable, and
1653 /// false otherwise.
1654 /// \p VF is the vectorization factor chosen for the original loop.
1656
1657private:
1658 unsigned NumPredStores = 0;
1659
1660 /// \return An upper bound for the vectorization factors for both
1661 /// fixed and scalable vectorization, where the minimum-known number of
1662 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1663 /// disabled or unsupported, then the scalable part will be equal to
1664 /// ElementCount::getScalable(0).
1665 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1666 ElementCount UserVF,
1667 bool FoldTailByMasking);
1668
1669 /// \return the maximized element count based on the target's vector
1670 /// registers and the loop trip-count, but limited to a maximum safe VF.
1671 /// This is a helper function of computeFeasibleMaxVF.
1672 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1673 unsigned SmallestType,
1674 unsigned WidestType,
1675 ElementCount MaxSafeVF,
1676 bool FoldTailByMasking);
1677
1678 /// \return the maximum legal scalable VF, based on the safe max number
1679 /// of elements.
1680 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1681
1682 /// Returns the execution time cost of an instruction for a given vector
1683 /// width. Vector width of one means scalar.
1684 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1685
1686 /// The cost-computation logic from getInstructionCost which provides
1687 /// the vector type as an output parameter.
1688 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1689 Type *&VectorTy);
1690
1691 /// Return the cost of instructions in an inloop reduction pattern, if I is
1692 /// part of that pattern.
1693 std::optional<InstructionCost>
1694 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1696
1697 /// Calculate vectorization cost of memory instruction \p I.
1698 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1699
1700 /// The cost computation for scalarized memory instruction.
1701 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1702
1703 /// The cost computation for interleaving group of memory instructions.
1704 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1705
1706 /// The cost computation for Gather/Scatter instruction.
1707 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1708
1709 /// The cost computation for widening instruction \p I with consecutive
1710 /// memory access.
1711 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1712
1713 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1714 /// Load: scalar load + broadcast.
1715 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1716 /// element)
1717 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
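  // For illustration: with VF = 4, a load from a uniform (loop-invariant)
  // address is costed as one scalar load plus a broadcast rather than a
  // gather, roughly corresponding to
  //   %s = load i32, ptr %p
  //   %v = shufflevector <4 x i32> %s.ins, <4 x i32> poison, zeroinitializer
  // where %s.ins is %s inserted into lane 0.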
1718
1719 /// Estimate the overhead of scalarizing an instruction. This is a
1720 /// convenience wrapper for the type-based getScalarizationOverhead API.
1721 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1723
1724 /// Returns true if an artificially high cost for emulated masked memrefs
1725 /// should be used.
1726 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1727
1728 /// Map of scalar integer values to the smallest bitwidth they can be legally
1729 /// represented as. The vector equivalents of these values should be truncated
1730 /// to this type.
1732
1733 /// A type representing the costs for instructions if they were to be
1734 /// scalarized rather than vectorized. The entries are Instruction-Cost
1735 /// pairs.
1736 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1737
1738 /// A set containing all BasicBlocks that are known to be present after
1739 /// vectorization as predicated blocks.
1741 PredicatedBBsAfterVectorization;
1742
1743 /// Records whether it is allowed to have the original scalar loop execute at
1744 /// least once. This may be needed as a fallback loop in case runtime
1745 /// aliasing/dependence checks fail, or to handle the tail/remainder
1746 /// iterations when the trip count is unknown or doesn't divide by the VF,
1747 /// or as a peel-loop to handle gaps in interleave-groups.
1748 /// Under optsize and when the trip count is very small we don't allow any
1749 /// iterations to execute in the scalar loop.
1750 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1751
1752 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1753 bool CanFoldTailByMasking = false;
1754
1755 /// A map holding scalar costs for different vectorization factors. The
1756 /// presence of a cost for an instruction in the mapping indicates that the
1757 /// instruction will be scalarized when vectorizing with the associated
1758 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1760
1761 /// Holds the instructions known to be uniform after vectorization.
1762 /// The data is collected per VF.
1764
1765 /// Holds the instructions known to be scalar after vectorization.
1766 /// The data is collected per VF.
1768
1769 /// Holds the instructions (address computations) that are forced to be
1770 /// scalarized.
1772
1773 /// PHINodes of the reductions that should be expanded in-loop.
1774 SmallPtrSet<PHINode *, 4> InLoopReductions;
1775
1776 /// A Map of inloop reduction operations and their immediate chain operand.
1777 /// FIXME: This can be removed once reductions can be costed correctly in
1778 /// VPlan. This was added to allow quick lookup of the inloop operations.
1779 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1780
1781 /// Returns the expected difference in cost from scalarizing the expression
1782 /// feeding a predicated instruction \p PredInst. The instructions to
1783 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1784 /// non-negative return value implies the expression will be scalarized.
1785 /// Currently, only single-use chains are considered for scalarization.
1786 InstructionCost computePredInstDiscount(Instruction *PredInst,
1787 ScalarCostsTy &ScalarCosts,
1788 ElementCount VF);
1789
1790 /// Collect the instructions that are uniform after vectorization. An
1791 /// instruction is uniform if we represent it with a single scalar value in
1792 /// the vectorized loop corresponding to each vector iteration. Examples of
1793 /// uniform instructions include pointer operands of consecutive or
1794 /// interleaved memory accesses. Note that although uniformity implies an
1795 /// instruction will be scalar, the reverse is not true. In general, a
1796 /// scalarized instruction will be represented by VF scalar values in the
1797 /// vectorized loop, each corresponding to an iteration of the original
1798 /// scalar loop.
1799 void collectLoopUniforms(ElementCount VF);
1800
1801 /// Collect the instructions that are scalar after vectorization. An
1802 /// instruction is scalar if it is known to be uniform or will be scalarized
1803 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1804 /// to the list if they are used by a load/store instruction that is marked as
1805 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1806 /// VF values in the vectorized loop, each corresponding to an iteration of
1807 /// the original scalar loop.
1808 void collectLoopScalars(ElementCount VF);
1809
1810 /// Keeps cost model vectorization decision and cost for instructions.
1811 /// Right now it is used for memory instructions only.
1813 std::pair<InstWidening, InstructionCost>>;
1814
1815 DecisionList WideningDecisions;
1816
1817 /// Returns true if \p V is expected to be vectorized and it needs to be
1818 /// extracted.
1819 bool needsExtract(Value *V, ElementCount VF) const {
1820 Instruction *I = dyn_cast<Instruction>(V);
1821 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1822 TheLoop->isLoopInvariant(I))
1823 return false;
1824
1825 // Assume we can vectorize V (and hence we need extraction) if the
1826 // scalars are not computed yet. This can happen, because it is called
1827 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1828 // the scalars are collected. That should be a safe assumption in most
1829 // cases, because we check if the operands have vectorizable types
1830 // beforehand in LoopVectorizationLegality.
1831 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1832 };
1833
1834 /// Returns a range containing only operands needing to be extracted.
1835 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1836 ElementCount VF) const {
1837 return SmallVector<Value *, 4>(make_filter_range(
1838 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1839 }
1840
1841public:
1842 /// The loop that we evaluate.
1844
1845 /// Predicated scalar evolution analysis.
1847
1848 /// Loop Info analysis.
1850
1851 /// Vectorization legality.
1853
1854 /// Vector target information.
1856
1857 /// Target Library Info.
1859
1860 /// Demanded bits analysis.
1862
1863 /// Assumption cache.
1865
1866 /// Interface to emit optimization remarks.
1868
1870
1871 /// Loop Vectorize Hint.
1873
1874 /// The interleave access information contains groups of interleaved accesses
1875 /// with the same stride that are close to each other.
1877
1878 /// Values to ignore in the cost model.
1880
1881 /// Values to ignore in the cost model when VF > 1.
1883
1884 /// All element types found in the loop.
1886};
1887} // end namespace llvm
1888
1889namespace {
1890/// Helper struct to manage generating runtime checks for vectorization.
1891///
1892 /// The runtime checks are created up-front in temporary blocks, un-linked from
1893 /// the existing IR, to allow estimating their cost more accurately. After deciding to
1894/// vectorize, the checks are moved back. If deciding not to vectorize, the
1895/// temporary blocks are completely removed.
1896class GeneratedRTChecks {
1897 /// Basic block which contains the generated SCEV checks, if any.
1898 BasicBlock *SCEVCheckBlock = nullptr;
1899
1900 /// The value representing the result of the generated SCEV checks. If it is
1901 /// nullptr, either no SCEV checks have been generated or they have been used.
1902 Value *SCEVCheckCond = nullptr;
1903
1904 /// Basic block which contains the generated memory runtime checks, if any.
1905 BasicBlock *MemCheckBlock = nullptr;
1906
1907 /// The value representing the result of the generated memory runtime checks.
1908 /// If it is nullptr, either no memory runtime checks have been generated or
1909 /// they have been used.
1910 Value *MemRuntimeCheckCond = nullptr;
1911
1912 DominatorTree *DT;
1913 LoopInfo *LI;
1915
1916 SCEVExpander SCEVExp;
1917 SCEVExpander MemCheckExp;
1918
1919 bool CostTooHigh = false;
1920
1921public:
1922 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1924 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1925 MemCheckExp(SE, DL, "scev.check") {}
1926
1927 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1928 /// accurately estimate the cost of the runtime checks. The blocks are
1929 /// un-linked from the IR and are added back during vector code generation. If
1930 /// there is no vector code generation, the check blocks are removed
1931 /// completely.
1932 void Create(Loop *L, const LoopAccessInfo &LAI,
1933 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1934
1935 // Hard cutoff to limit compile-time increase in case a very large number of
1936 // runtime checks needs to be generated.
1937 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1938 // profile info.
1939 CostTooHigh =
1941 if (CostTooHigh)
1942 return;
1943
1944 BasicBlock *LoopHeader = L->getHeader();
1945 BasicBlock *Preheader = L->getLoopPreheader();
1946
1947 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1948 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1949 // may be used by SCEVExpander. The blocks will be un-linked from their
1950 // predecessors and removed from LI & DT at the end of the function.
1951 if (!UnionPred.isAlwaysTrue()) {
1952 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1953 nullptr, "vector.scevcheck");
1954
1955 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1956 &UnionPred, SCEVCheckBlock->getTerminator());
1957 }
1958
1959 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1960 if (RtPtrChecking.Need) {
1961 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1962 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1963 "vector.memcheck");
1964
1965 auto DiffChecks = RtPtrChecking.getDiffChecks();
1966 if (DiffChecks) {
1967 Value *RuntimeVF = nullptr;
1968 MemRuntimeCheckCond = addDiffRuntimeChecks(
1969 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1970 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1971 if (!RuntimeVF)
1972 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1973 return RuntimeVF;
1974 },
1975 IC);
1976 } else {
1977 MemRuntimeCheckCond = addRuntimeChecks(
1978 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1980 }
1981 assert(MemRuntimeCheckCond &&
1982 "no RT checks generated although RtPtrChecking "
1983 "claimed checks are required");
1984 }
1985
1986 if (!MemCheckBlock && !SCEVCheckBlock)
1987 return;
1988
1989 // Unhook the temporary blocks containing the checks and update various
1990 // places accordingly.
1991 if (SCEVCheckBlock)
1992 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1993 if (MemCheckBlock)
1994 MemCheckBlock->replaceAllUsesWith(Preheader);
1995
1996 if (SCEVCheckBlock) {
1997 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1998 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1999 Preheader->getTerminator()->eraseFromParent();
2000 }
2001 if (MemCheckBlock) {
2002 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2003 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2004 Preheader->getTerminator()->eraseFromParent();
2005 }
2006
2007 DT->changeImmediateDominator(LoopHeader, Preheader);
2008 if (MemCheckBlock) {
2009 DT->eraseNode(MemCheckBlock);
2010 LI->removeBlock(MemCheckBlock);
2011 }
2012 if (SCEVCheckBlock) {
2013 DT->eraseNode(SCEVCheckBlock);
2014 LI->removeBlock(SCEVCheckBlock);
2015 }
2016 }
2017
2018 InstructionCost getCost() {
2019 if (SCEVCheckBlock || MemCheckBlock)
2020 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2021
2022 if (CostTooHigh) {
2023 InstructionCost Cost;
2024 Cost.setInvalid();
2025 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2026 return Cost;
2027 }
2028
2029 InstructionCost RTCheckCost = 0;
2030 if (SCEVCheckBlock)
2031 for (Instruction &I : *SCEVCheckBlock) {
2032 if (SCEVCheckBlock->getTerminator() == &I)
2033 continue;
2036 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2037 RTCheckCost += C;
2038 }
2039 if (MemCheckBlock)
2040 for (Instruction &I : *MemCheckBlock) {
2041 if (MemCheckBlock->getTerminator() == &I)
2042 continue;
2045 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2046 RTCheckCost += C;
2047 }
2048
2049 if (SCEVCheckBlock || MemCheckBlock)
2050 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2051 << "\n");
2052
2053 return RTCheckCost;
2054 }
2055
2056 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2057 /// unused.
2058 ~GeneratedRTChecks() {
2059 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2060 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2061 if (!SCEVCheckCond)
2062 SCEVCleaner.markResultUsed();
2063
2064 if (!MemRuntimeCheckCond)
2065 MemCheckCleaner.markResultUsed();
2066
2067 if (MemRuntimeCheckCond) {
2068 auto &SE = *MemCheckExp.getSE();
2069 // Memory runtime check generation creates compares that use expanded
2070 // values. Remove them before running the SCEVExpanderCleaners.
2071 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2072 if (MemCheckExp.isInsertedInstruction(&I))
2073 continue;
2074 SE.forgetValue(&I);
2075 I.eraseFromParent();
2076 }
2077 }
2078 MemCheckCleaner.cleanup();
2079 SCEVCleaner.cleanup();
2080
2081 if (SCEVCheckCond)
2082 SCEVCheckBlock->eraseFromParent();
2083 if (MemRuntimeCheckCond)
2084 MemCheckBlock->eraseFromParent();
2085 }
2086
2087 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2088 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2089 /// depending on the generated condition.
2090 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2091 BasicBlock *LoopVectorPreHeader,
2092 BasicBlock *LoopExitBlock) {
2093 if (!SCEVCheckCond)
2094 return nullptr;
2095
2096 Value *Cond = SCEVCheckCond;
2097 // Mark the check as used, to prevent it from being removed during cleanup.
2098 SCEVCheckCond = nullptr;
2099 if (auto *C = dyn_cast<ConstantInt>(Cond))
2100 if (C->isZero())
2101 return nullptr;
2102
2103 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2104
2105 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2106 // Create new preheader for vector loop.
2107 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2108 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2109
2110 SCEVCheckBlock->getTerminator()->eraseFromParent();
2111 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2112 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2113 SCEVCheckBlock);
2114
2115 DT->addNewBlock(SCEVCheckBlock, Pred);
2116 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2117
2118 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2119 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2120 return SCEVCheckBlock;
2121 }
2122
2123 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2124 /// the branches to branch to the vector preheader or \p Bypass, depending on
2125 /// the generated condition.
2126 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2127 BasicBlock *LoopVectorPreHeader) {
2128 // Check if we generated code that checks at runtime whether arrays overlap.
2129 if (!MemRuntimeCheckCond)
2130 return nullptr;
2131
2132 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2133 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2134 MemCheckBlock);
2135
2136 DT->addNewBlock(MemCheckBlock, Pred);
2137 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2138 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2139
2140 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2141 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2142
2144 MemCheckBlock->getTerminator(),
2145 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2146 MemCheckBlock->getTerminator()->setDebugLoc(
2147 Pred->getTerminator()->getDebugLoc());
2148
2149 // Mark the check as used, to prevent it from being removed during cleanup.
2150 MemRuntimeCheckCond = nullptr;
2151 return MemCheckBlock;
2152 }
2153};
2154} // namespace
2155
2157 return Style == TailFoldingStyle::Data ||
2160}
2161
2163 return Style == TailFoldingStyle::DataAndControlFlow ||
2165}
2166
2167// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2168// vectorization. The loop needs to be annotated with #pragma omp simd
2169 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2170// vector length information is not provided, vectorization is not considered
2171// explicit. Interleave hints are not allowed either. These limitations will be
2172// relaxed in the future.
2173 // Please note that we are currently forced to abuse the pragma 'clang
2174// vectorize' semantics. This pragma provides *auto-vectorization hints*
2175// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2176// provides *explicit vectorization hints* (LV can bypass legal checks and
2177// assume that vectorization is legal). However, both hints are implemented
2178// using the same metadata (llvm.loop.vectorize, processed by
2179// LoopVectorizeHints). This will be fixed in the future when the native IR
2180// representation for pragma 'omp simd' is introduced.
2181static bool isExplicitVecOuterLoop(Loop *OuterLp,
2183 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2184 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2185
2186 // Only outer loops with an explicit vectorization hint are supported.
2187 // Unannotated outer loops are ignored.
2189 return false;
2190
2191 Function *Fn = OuterLp->getHeader()->getParent();
2192 if (!Hints.allowVectorization(Fn, OuterLp,
2193 true /*VectorizeOnlyWhenForced*/)) {
2194 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2195 return false;
2196 }
2197
2198 if (Hints.getInterleave() > 1) {
2199 // TODO: Interleave support is future work.
2200 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2201 "outer loops.\n");
2202 Hints.emitRemarkWithHints();
2203 return false;
2204 }
2205
2206 return true;
2207}
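// For example, an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// (or '#pragma omp simd simdlen(4)') passes this check, whereas the same
// pragma without an explicit vector length is not treated as explicit and the
// loop is skipped, as is a loop that additionally requests interleaving.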
2208
2212 // Collect inner loops and outer loops without irreducible control flow. For
2213 // now, only collect outer loops that have explicit vectorization hints. If we
2214 // are stress testing the VPlan H-CFG construction, we collect the outermost
2215 // loop of every loop nest.
2216 if (L.isInnermost() || VPlanBuildStressTest ||
2218 LoopBlocksRPO RPOT(&L);
2219 RPOT.perform(LI);
2220 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2221 V.push_back(&L);
2222 // TODO: Collect inner loops inside marked outer loops in case
2223 // vectorization fails for the outer loop. Do not invoke
2224 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2225 // already known to be reducible. We can use an inherited attribute for
2226 // that.
2227 return;
2228 }
2229 }
2230 for (Loop *InnerL : L)
2231 collectSupportedLoops(*InnerL, LI, ORE, V);
2232}
2233
2234//===----------------------------------------------------------------------===//
2235// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2236// LoopVectorizationCostModel and LoopVectorizationPlanner.
2237//===----------------------------------------------------------------------===//
2238
2239/// Compute the transformed value of Index at offset StartValue using step
2240/// StepValue.
2241/// For integer induction, returns StartValue + Index * StepValue.
2242/// For pointer induction, returns StartValue[Index * StepValue].
2243/// FIXME: The newly created binary instructions should contain nsw/nuw
2244/// flags, which can be found from the original scalar operations.
2245static Value *
2247 Value *Step,
2249 const BinaryOperator *InductionBinOp) {
2250 Type *StepTy = Step->getType();
2251 Value *CastedIndex = StepTy->isIntegerTy()
2252 ? B.CreateSExtOrTrunc(Index, StepTy)
2253 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2254 if (CastedIndex != Index) {
2255 CastedIndex->setName(CastedIndex->getName() + ".cast");
2256 Index = CastedIndex;
2257 }
2258
2259 // Note: the IR at this point is broken. We cannot use SE to create any new
2260 // SCEV and then expand it, hoping that SCEV's simplification will give us
2261 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2262 // lead to various SCEV crashes. So all we can do is to use builder and rely
2263 // on InstCombine for future simplifications. Here we handle some trivial
2264 // cases only.
2265 auto CreateAdd = [&B](Value *X, Value *Y) {
2266 assert(X->getType() == Y->getType() && "Types don't match!");
2267 if (auto *CX = dyn_cast<ConstantInt>(X))
2268 if (CX->isZero())
2269 return Y;
2270 if (auto *CY = dyn_cast<ConstantInt>(Y))
2271 if (CY->isZero())
2272 return X;
2273 return B.CreateAdd(X, Y);
2274 };
2275
2276 // We allow X to be a vector type, in which case Y will potentially be
2277 // splatted into a vector with the same element count.
2278 auto CreateMul = [&B](Value *X, Value *Y) {
2279 assert(X->getType()->getScalarType() == Y->getType() &&
2280 "Types don't match!");
2281 if (auto *CX = dyn_cast<ConstantInt>(X))
2282 if (CX->isOne())
2283 return Y;
2284 if (auto *CY = dyn_cast<ConstantInt>(Y))
2285 if (CY->isOne())
2286 return X;
2287 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2288 if (XVTy && !isa<VectorType>(Y->getType()))
2289 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2290 return B.CreateMul(X, Y);
2291 };
2292
2293 switch (InductionKind) {
2295 assert(!isa<VectorType>(Index->getType()) &&
2296 "Vector indices not supported for integer inductions yet");
2297 assert(Index->getType() == StartValue->getType() &&
2298 "Index type does not match StartValue type");
2299 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2300 return B.CreateSub(StartValue, Index);
2301 auto *Offset = CreateMul(Index, Step);
2302 return CreateAdd(StartValue, Offset);
2303 }
2305 return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step));
2306 }
2308 assert(!isa<VectorType>(Index->getType()) &&
2309 "Vector indices not supported for FP inductions yet");
2310 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2311 assert(InductionBinOp &&
2312 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2313 InductionBinOp->getOpcode() == Instruction::FSub) &&
2314 "Original bin op should be defined for FP induction");
2315
2316 Value *MulExp = B.CreateFMul(Step, Index);
2317 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2318 "induction");
2319 }
2321 return nullptr;
2322 }
2323 llvm_unreachable("invalid enum");
2324}
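// For example (a sketch of the emitted IR): an integer induction with start
// %start and step 3 maps index 8 to %start + (8 * 3), while a pointer
// induction emits a byte-wise GEP:
//   %offset = mul i64 %index, %step
//   %gep = getelementptr i8, ptr %start, i64 %offset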
2325
2326std::optional<unsigned> getMaxVScale(const Function &F,
2327 const TargetTransformInfo &TTI) {
2328 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2329 return MaxVScale;
2330
2331 if (F.hasFnAttribute(Attribute::VScaleRange))
2332 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2333
2334 return std::nullopt;
2335}
2336
2337 /// For the given VF and UF and maximum trip count computed for the loop, return
2338 /// whether the induction variable is known not to overflow in the vectorized
2339 /// loop. If it cannot overflow, a runtime overflow check always evaluates to
2340 /// false and can be removed.
2343 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2344 // Always be conservative if we don't know the exact unroll factor.
2345 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2346
2347 Type *IdxTy = Cost->Legal->getWidestInductionType();
2348 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2349
2350 // The runtime overflow check is known to be false iff the (max) trip-count
2351 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2352 // the vector loop induction variable.
2353 if (unsigned TC =
2354 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2355 uint64_t MaxVF = VF.getKnownMinValue();
2356 if (VF.isScalable()) {
2357 std::optional<unsigned> MaxVScale =
2358 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2359 if (!MaxVScale)
2360 return false;
2361 MaxVF *= *MaxVScale;
2362 }
2363
2364 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2365 }
2366
2367 return false;
2368}
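// For example, with an i32 induction variable, a known maximum trip count of
// 1000 and VF * UF = 8, UINT32_MAX - 1000 is far greater than 8, so the
// overflow check is known to be false and can be elided.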
2369
2370// Return whether we allow using masked interleave-groups (for dealing with
2371// strided loads/stores that reside in predicated blocks, or for dealing
2372// with gaps).
2374 // If an override option has been passed in for interleaved accesses, use it.
2375 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2376 return EnableMaskedInterleavedMemAccesses;
2377
2378 return TTI.enableMaskedInterleavedAccessVectorization();
2379 }
2380
2381// Try to vectorize the interleave group that \p Instr belongs to.
2382//
2383// E.g. Translate following interleaved load group (factor = 3):
2384// for (i = 0; i < N; i+=3) {
2385// R = Pic[i]; // Member of index 0
2386// G = Pic[i+1]; // Member of index 1
2387// B = Pic[i+2]; // Member of index 2
2388// ... // do something to R, G, B
2389// }
2390// To:
2391// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2392// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2393// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2394// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2395//
2396// Or translate following interleaved store group (factor = 3):
2397// for (i = 0; i < N; i+=3) {
2398// ... do something to R, G, B
2399// Pic[i] = R; // Member of index 0
2400// Pic[i+1] = G; // Member of index 1
2401// Pic[i+2] = B; // Member of index 2
2402// }
2403// To:
2404// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2405// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2406// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2407// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2408// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2411 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2412 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2413 Instruction *Instr = Group->getInsertPos();
2414 const DataLayout &DL = Instr->getModule()->getDataLayout();
2415
2416 // Prepare for the vector type of the interleaved load/store.
2417 Type *ScalarTy = getLoadStoreType(Instr);
2418 unsigned InterleaveFactor = Group->getFactor();
2419 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2420
2421 // Prepare for the new pointers.
2422 SmallVector<Value *, 2> AddrParts;
2423 unsigned Index = Group->getIndex(Instr);
2424
2425 // TODO: extend the masked interleaved-group support to reversed access.
2426 assert((!BlockInMask || !Group->isReverse()) &&
2427 "Reversed masked interleave-group not supported.");
2428
2429 Value *Idx;
2430 // If the group is reverse, adjust the index to refer to the last vector lane
2431 // instead of the first. We adjust the index from the first vector lane,
2432 // rather than directly getting the pointer for lane VF - 1, because the
2433 // pointer operand of the interleaved access is supposed to be uniform. For
2434 // uniform instructions, we're only required to generate a value for the
2435 // first vector lane in each unroll iteration.
2436 if (Group->isReverse()) {
2437 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2438 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2442 } else
2444
2445 for (unsigned Part = 0; Part < UF; Part++) {
2446 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2447 if (auto *I = dyn_cast<Instruction>(AddrPart))
2448 State.setDebugLocFrom(I->getDebugLoc());
2449
2450 // Note that the current instruction could be at any index in the group. We
2451 // need to adjust the address to that of the member at index 0.
2452 //
2453 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2454 // b = A[i]; // Member of index 0
2455 // The current pointer points to A[i+1]; adjust it to A[i].
2456 //
2457 // E.g. A[i+1] = a; // Member of index 1
2458 // A[i] = b; // Member of index 0
2459 // A[i+2] = c; // Member of index 2 (Current instruction)
2460 // The current pointer points to A[i+2]; adjust it to A[i].
2461
2462 bool InBounds = false;
2463 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2464 InBounds = gep->isInBounds();
2465 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2466 AddrParts.push_back(AddrPart);
2467 }
2468
2469 State.setDebugLocFrom(Instr->getDebugLoc());
2470 Value *PoisonVec = PoisonValue::get(VecTy);
2471
2472 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2473 unsigned Part, Value *MaskForGaps) -> Value * {
2474 if (VF.isScalable()) {
2475 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2476 assert(InterleaveFactor == 2 &&
2477 "Unsupported deinterleave factor for scalable vectors");
2478 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2479 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2480 auto *MaskTy =
2482 return Builder.CreateIntrinsic(
2483 MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2484 /*FMFSource=*/nullptr, "interleaved.mask");
2485 }
2486
2487 if (!BlockInMask)
2488 return MaskForGaps;
2489
2490 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2491 Value *ShuffledMask = Builder.CreateShuffleVector(
2492 BlockInMaskPart,
2493 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2494 "interleaved.mask");
2495 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2496 MaskForGaps)
2497 : ShuffledMask;
2498 };
2499
2500 // Vectorize the interleaved load group.
2501 if (isa<LoadInst>(Instr)) {
2502 Value *MaskForGaps = nullptr;
2503 if (NeedsMaskForGaps) {
2504 MaskForGaps =
2506 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2507 }
2508
2509 // For each unroll part, create a wide load for the group.
2510 SmallVector<Value *, 2> NewLoads;
2511 for (unsigned Part = 0; Part < UF; Part++) {
2512 Instruction *NewLoad;
2513 if (BlockInMask || MaskForGaps) {
2515 "masked interleaved groups are not allowed.");
2516 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2517 NewLoad =
2518 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2519 GroupMask, PoisonVec, "wide.masked.vec");
2520 }
2521 else
2522 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2523 Group->getAlign(), "wide.vec");
2524 Group->addMetadata(NewLoad);
2525 NewLoads.push_back(NewLoad);
2526 }
2527
2528 if (VecTy->isScalableTy()) {
2529 assert(InterleaveFactor == 2 &&
2530 "Unsupported deinterleave factor for scalable vectors");
2531
2532 for (unsigned Part = 0; Part < UF; ++Part) {
2533 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2534 // so must use intrinsics to deinterleave.
2536 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2537 /*FMFSource=*/nullptr, "strided.vec");
2538 unsigned J = 0;
2539 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2540 Instruction *Member = Group->getMember(I);
2541
2542 if (!Member)
2543 continue;
2544
2545 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2546 // If this member has different type, cast the result type.
2547 if (Member->getType() != ScalarTy) {
2548 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2549 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2550 }
2551
2552 if (Group->isReverse())
2553 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2554
2555 State.set(VPDefs[J], StridedVec, Part);
2556 ++J;
2557 }
2558 }
2559
2560 return;
2561 }
2562
2563 // For each member in the group, shuffle out the appropriate data from the
2564 // wide loads.
2565 unsigned J = 0;
2566 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2567 Instruction *Member = Group->getMember(I);
2568
2569 // Skip the gaps in the group.
2570 if (!Member)
2571 continue;
2572
2573 auto StrideMask =
2574 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2575 for (unsigned Part = 0; Part < UF; Part++) {
2576 Value *StridedVec = Builder.CreateShuffleVector(
2577 NewLoads[Part], StrideMask, "strided.vec");
2578
2579 // If this member has different type, cast the result type.
2580 if (Member->getType() != ScalarTy) {
2581 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2582 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2583 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2584 }
2585
2586 if (Group->isReverse())
2587 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2588
2589 State.set(VPDefs[J], StridedVec, Part);
2590 }
2591 ++J;
2592 }
2593 return;
2594 }
2595
2596 // The sub vector type for current instruction.
2597 auto *SubVT = VectorType::get(ScalarTy, VF);
2598
2599 // Vectorize the interleaved store group.
2600 Value *MaskForGaps =
2602 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2603 "masked interleaved groups are not allowed.");
2604 assert((!MaskForGaps || !VF.isScalable()) &&
2605 "masking gaps for scalable vectors is not yet supported.");
2606 for (unsigned Part = 0; Part < UF; Part++) {
2607 // Collect the stored vector from each member.
2608 SmallVector<Value *, 4> StoredVecs;
2609 unsigned StoredIdx = 0;
2610 for (unsigned i = 0; i < InterleaveFactor; i++) {
2611 assert((Group->getMember(i) || MaskForGaps) &&
2612 "Fail to get a member from an interleaved store group");
2613 Instruction *Member = Group->getMember(i);
2614
2615 // Skip the gaps in the group.
2616 if (!Member) {
2617 Value *Undef = PoisonValue::get(SubVT);
2618 StoredVecs.push_back(Undef);
2619 continue;
2620 }
2621
2622 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2623 ++StoredIdx;
2624
2625 if (Group->isReverse())
2626 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2627
2628 // If this member has different type, cast it to a unified type.
2629
2630 if (StoredVec->getType() != SubVT)
2631 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2632
2633 StoredVecs.push_back(StoredVec);
2634 }
2635
2636 // Interleave all the smaller vectors into one wider vector.
2637 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2638 Instruction *NewStoreInstr;
2639 if (BlockInMask || MaskForGaps) {
2640 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2641 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2642 Group->getAlign(), GroupMask);
2643 } else
2644 NewStoreInstr =
2645 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2646
2647 Group->addMetadata(NewStoreInstr);
2648 }
2649}
2650
2652 VPReplicateRecipe *RepRecipe,
2653 const VPIteration &Instance,
2654 VPTransformState &State) {
2655 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2656
2657 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2658 // the first lane and part.
2659 if (isa<NoAliasScopeDeclInst>(Instr))
2660 if (!Instance.isFirstIteration())
2661 return;
2662
2663 // Does this instruction return a value?
2664 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2665
2666 Instruction *Cloned = Instr->clone();
2667 if (!IsVoidRetTy)
2668 Cloned->setName(Instr->getName() + ".cloned");
2669
2670 RepRecipe->setFlags(Cloned);
2671
2672 if (auto DL = Instr->getDebugLoc())
2673 State.setDebugLocFrom(DL);
2674
2675 // Replace the operands of the cloned instructions with their scalar
2676 // equivalents in the new loop.
2677 for (const auto &I : enumerate(RepRecipe->operands())) {
2678 auto InputInstance = Instance;
2679 VPValue *Operand = I.value();
2681 InputInstance.Lane = VPLane::getFirstLane();
2682 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2683 }
2684 State.addNewMetadata(Cloned, Instr);
2685
2686 // Place the cloned scalar in the new loop.
2687 State.Builder.Insert(Cloned);
2688
2689 State.set(RepRecipe, Cloned, Instance);
2690
2691 // If we just cloned a new assumption, add it to the assumption cache.
2692 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2694
2695 // End if-block.
2696 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2697 if (IfPredicateInstr)
2698 PredicatedInstructions.push_back(Cloned);
2699}
2700
2701Value *
2703 if (VectorTripCount)
2704 return VectorTripCount;
2705
2706 Value *TC = getTripCount();
2707 IRBuilder<> Builder(InsertBlock->getTerminator());
2708
2709 Type *Ty = TC->getType();
2710 // This is where we can make the step a runtime constant.
2711 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2712
2713 // If the tail is to be folded by masking, round the number of iterations N
2714 // up to a multiple of Step instead of rounding down. This is done by first
2715 // adding Step-1 and then rounding down. Note that it's ok if this addition
2716 // overflows: the vector induction variable will eventually wrap to zero given
2717 // that it starts at zero and its Step is a power of two; the loop will then
2718 // exit, with the last early-exit vector comparison also producing all-true.
2719 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2720 // is accounted for in emitIterationCountCheck that adds an overflow check.
2721 if (Cost->foldTailByMasking()) {
2723 "VF*UF must be a power of 2 when folding tail by masking");
2724 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2725 TC = Builder.CreateAdd(
2726 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2727 }
2728
2729 // Now we need to generate the expression for the part of the loop that the
2730 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2731 // iterations are not required for correctness, or N - Step, otherwise. Step
2732 // is equal to the vectorization factor (number of SIMD elements) times the
2733 // unroll factor (number of SIMD instructions).
2734 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2735
2736 // There are cases where we *must* run at least one iteration in the remainder
2737 // loop. See the cost model for when this can happen. If the step evenly
2738 // divides the trip count, we set the remainder to be equal to the step. If
2739 // the step does not evenly divide the trip count, no adjustment is necessary
2740 // since there will already be scalar iterations. Note that the minimum
2741 // iterations check ensures that N >= Step.
2742 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2743 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2744 R = Builder.CreateSelect(IsZero, Step, R);
2745 }
2746
2747 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2748
2749 return VectorTripCount;
2750}
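// A few worked examples of the computation above: with a trip count of 17 and
// VF * UF = 8, n.mod.vf is 1 and n.vec is 16. If the trip count is 16 and a
// scalar epilogue is required, the remainder is bumped from 0 to 8, giving
// n.vec = 8 so the epilogue still runs. With tail folding, 17 is first rounded
// up to 24 and n.vec is 24.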
2751
2753 const DataLayout &DL) {
2754 // Verify that V is a vector type with same number of elements as DstVTy.
2755 auto *DstFVTy = cast<VectorType>(DstVTy);
2756 auto VF = DstFVTy->getElementCount();
2757 auto *SrcVecTy = cast<VectorType>(V->getType());
2758 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2759 Type *SrcElemTy = SrcVecTy->getElementType();
2760 Type *DstElemTy = DstFVTy->getElementType();
2761 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2762 "Vector elements must have same size");
2763
2764 // Do a direct cast if element types are castable.
2765 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2766 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2767 }
2768 // V cannot be directly casted to desired vector type.
2769 // May happen when V is a floating point vector but DstVTy is a vector of
2770 // pointers or vice-versa. Handle this using a two-step bitcast using an
2771 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2772 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2773 "Only one type should be a pointer type");
2774 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2775 "Only one type should be a floating point type");
2776 Type *IntTy =
2777 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2778 auto *VecIntTy = VectorType::get(IntTy, VF);
2779 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2780 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2781}
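// For example, on a target with 64-bit pointers, <4 x double> cannot be
// bitcast directly to <4 x ptr>, so the cast is done in two steps through an
// integer vector of the same element width:
//   <4 x double> -> <4 x i64> -> <4 x ptr>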
2782
2784 Value *Count = getTripCount();
2785 // Reuse existing vector loop preheader for TC checks.
2786 // Note that a new preheader block is generated for the vector loop.
2787 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2788 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2789
2790 // Generate code to check if the loop's trip count is less than VF * UF, or
2791 // equal to it in case a scalar epilogue is required; this implies that the
2792 // vector trip count is zero. This check also covers the case where adding one
2793 // to the backedge-taken count overflowed leading to an incorrect trip count
2794 // of zero. In this case we will also jump to the scalar loop.
2795 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2797
2798 // If tail is to be folded, vector loop takes care of all iterations.
2799 Type *CountTy = Count->getType();
2800 Value *CheckMinIters = Builder.getFalse();
2801 auto CreateStep = [&]() -> Value * {
2802 // Create step with max(MinProfitableTripCount, UF * VF).
2804 return createStepForVF(Builder, CountTy, VF, UF);
2805
2806 Value *MinProfTC =
2808 if (!VF.isScalable())
2809 return MinProfTC;
2810 return Builder.CreateBinaryIntrinsic(
2811 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2812 };
2813
2814 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2815 if (Style == TailFoldingStyle::None)
2816 CheckMinIters =
2817 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2818 else if (VF.isScalable() &&
2821 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2822 // an overflow to zero when updating induction variables and so an
2823 // additional overflow check is required before entering the vector loop.
2824
2825 // Get the maximum unsigned value for the type.
2826 Value *MaxUIntTripCount =
2827 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2828 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2829
2830 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2831 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2832 }
2833
2834 // Create new preheader for vector loop.
2836 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2837 "vector.ph");
2838
2839 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2840 DT->getNode(Bypass)->getIDom()) &&
2841 "TC check is expected to dominate Bypass");
2842
2843 // Update dominator for Bypass & LoopExit (if needed).
2844 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2845 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2846 // If there is an epilogue which must run, there's no edge from the
2847 // middle block to exit blocks and thus no need to update the immediate
2848 // dominator of the exit blocks.
2850
2852 TCCheckBlock->getTerminator(),
2853 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2854 LoopBypassBlocks.push_back(TCCheckBlock);
2855}
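// As an illustration (assuming the minimum profitable trip count does not
// exceed VF * UF): with VF = 4, UF = 2, no tail folding and no required
// scalar epilogue, the emitted guard is roughly
//   %min.iters.check = icmp ult i64 %count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// so trip counts smaller than VF * UF bypass the vector loop entirely.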
2856
2858 BasicBlock *const SCEVCheckBlock =
2859 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2860 if (!SCEVCheckBlock)
2861 return nullptr;
2862
2863 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2864 (OptForSizeBasedOnProfile &&
2865 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2866 "Cannot SCEV check stride or overflow when optimizing for size");
2867
2868
2869 // Update dominator only if this is first RT check.
2870 if (LoopBypassBlocks.empty()) {
2871 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2872 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2873 // If there is an epilogue which must run, there's no edge from the
2874 // middle block to exit blocks and thus no need to update the immediate
2875 // dominator of the exit blocks.
2876 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2877 }
2878
2879 LoopBypassBlocks.push_back(SCEVCheckBlock);
2880 AddedSafetyChecks = true;
2881 return SCEVCheckBlock;
2882}
2883
2885 // VPlan-native path does not do any analysis for runtime checks currently.
2887 return nullptr;
2888
2889 BasicBlock *const MemCheckBlock =
2890 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2891
2892 // Check if we generated code that checks at runtime whether arrays overlap. We put
2893 // the checks into a separate block to make the more common case of few
2894 // elements faster.
2895 if (!MemCheckBlock)
2896 return nullptr;
2897
2898 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2899 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2900 "Cannot emit memory checks when optimizing for size, unless forced "
2901 "to vectorize.");
2902 ORE->emit([&]() {
2903 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2906 << "Code-size may be reduced by not forcing "
2907 "vectorization, or by source-code modifications "
2908 "eliminating the need for runtime checks "
2909 "(e.g., adding 'restrict').";
2910 });
2911 }
2912
2913 LoopBypassBlocks.push_back(MemCheckBlock);
2914
2915 AddedSafetyChecks = true;
2916
2917 return MemCheckBlock;
2918}
2919
2923 assert(LoopVectorPreHeader && "Invalid loop structure");
2924 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2925 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2926 "multiple exit loop without required epilogue?");
2927
2930 LI, nullptr, Twine(Prefix) + "middle.block");
2933 nullptr, Twine(Prefix) + "scalar.ph");
2934
2935 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2936
2937 // Set up the middle block terminator. Two cases:
2938 // 1) If we know that we must execute the scalar epilogue, emit an
2939 // unconditional branch.
2940 // 2) Otherwise, we must have a single unique exit block (due to how we
2941 // implement the multiple exit case). In this case, set up a conditional
2942 // branch from the middle block to the loop scalar preheader, and the
2943 // exit block. completeLoopSkeleton will update the condition to use an
2944 // iteration check, if required to decide whether to execute the remainder.
2945 BranchInst *BrInst =
2946 Cost->requiresScalarEpilogue(VF.isVector())
2949 Builder.getTrue());
2950 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2952
2953 // Update dominator for loop exit. During skeleton creation, only the vector
2954 // pre-header and the middle block are created. The vector loop is entirely
2955 // created during VPlan execution.
2956 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2957 // If there is an epilogue which must run, there's no edge from the
2958 // middle block to exit blocks and thus no need to update the immediate
2959 // dominator of the exit blocks.
2961}
2962
2964 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2965 ArrayRef<BasicBlock *> BypassBlocks,
2966 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2968 assert(VectorTripCount && "Expected valid arguments");
2969
2970 Instruction *OldInduction = Legal->getPrimaryInduction();
2971 Value *&EndValue = IVEndValues[OrigPhi];
2972 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2973 if (OrigPhi == OldInduction) {
2974 // We know what the end value is.
2975 EndValue = VectorTripCount;
2976 } else {
2978
2979 // Fast-math-flags propagate from the original induction instruction.
2980 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2981 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2982
2984 Step, II.getKind(), II.getInductionBinOp());
2985 EndValue->setName("ind.end");
2986
2987 // Compute the end value for the additional bypass (if applicable).
2988 if (AdditionalBypass.first) {
2989 B.SetInsertPoint(AdditionalBypass.first,
2990 AdditionalBypass.first->getFirstInsertionPt());
2991 EndValueFromAdditionalBypass =
2992 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
2993 Step, II.getKind(), II.getInductionBinOp());
2994 EndValueFromAdditionalBypass->setName("ind.end");
2995 }
2996 }
2997
2998 // Create phi nodes to merge from the backedge-taken check block.
2999 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3001 // Copy original phi DL over to the new one.
3002 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3003
3004 // The new PHI merges the original incoming value, in case of a bypass,
3005 // or the value at the end of the vectorized loop.
3006 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3007
3008 // Fix the scalar body counter (PHI node).
3009 // The old induction's phi node in the scalar body needs the truncated
3010 // value.
3011 for (BasicBlock *BB : BypassBlocks)
3012 BCResumeVal->addIncoming(II.getStartValue(), BB);
3013
3014 if (AdditionalBypass.first)
3015 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3016 EndValueFromAdditionalBypass);
3017 return BCResumeVal;
3018}
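// A sketch of the resulting merge PHI in the scalar preheader (block names
// depend on which bypass blocks exist):
//   %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
//                            [ %start, %vector.scevcheck ],
//                            [ %start, %vector.memcheck ]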
3019
3020/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3021/// expansion results.
3023 const SCEV2ValueTy &ExpandedSCEVs) {
3024 const SCEV *Step = ID.getStep();
3025 if (auto *C = dyn_cast<SCEVConstant>(Step))
3026 return C->getValue();
3027 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3028 return U->getValue();
3029 auto I = ExpandedSCEVs.find(Step);
3030 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3031 return I->second;
3032}
3033
3035 const SCEV2ValueTy &ExpandedSCEVs,
3036 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3037 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3038 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3039 "Inconsistent information about additional bypass.");
3040 // We are going to resume the execution of the scalar loop.
3041 // Go over all of the induction variables that we found and fix the
3042 // PHIs that are left in the scalar version of the loop.
3043 // The starting values of PHI nodes depend on the counter of the last
3044 // iteration in the vectorized loop.
3045 // If we come from a bypass edge then we need to start from the original
3046 // start value.
3047 for (const auto &InductionEntry : Legal->getInductionVars()) {
3048 PHINode *OrigPhi = InductionEntry.first;
3049 const InductionDescriptor &II = InductionEntry.second;
3050 PHINode *BCResumeVal = createInductionResumeValue(
3051 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3052 AdditionalBypass);
3053 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3054 }
3055}
3056
3058 // The trip counts should be cached by now.
3059 Value *Count = getTripCount();
3061
3062 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3063
3064 // Add a check in the middle block to see if we have completed
3065 // all of the iterations in the first vector loop. Three cases:
3066 // 1) If we require a scalar epilogue, there is no conditional branch as
3067 // we unconditionally branch to the scalar preheader. Do nothing.
3068 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3069 // Thus if tail is to be folded, we know we don't need to run the
3070 // remainder and we can use the previous value for the condition (true).
3071 // 3) Otherwise, construct a runtime check.
3072 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3073 !Cost->foldTailByMasking()) {
3074 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3075 // of the corresponding compare because they may have ended up with
3076 // different line numbers and we want to avoid awkward line stepping while
3077 // debugging. Eg. if the compare has got a line number inside the loop.
3078 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3079 // operands. Perform simplification directly on VPlan once the branch is
3080 // modeled there.
3082 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3083 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3084 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3085 }
3086
3087#ifdef EXPENSIVE_CHECKS
3088 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3089#endif
3090
3091 return LoopVectorPreHeader;
3092}
3093
3094std::pair<BasicBlock *, Value *>
3096 const SCEV2ValueTy &ExpandedSCEVs) {
3097 /*
3098 In this function we generate a new loop. The new loop will contain
3099 the vectorized instructions while the old loop will continue to run the
3100 scalar remainder.
3101
3102 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3103 / | preheader are expanded here. Eventually all required SCEV
3104 / | expansion should happen here.
3105 / v
3106 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3107 | / |
3108 | / v
3109 || [ ] <-- vector pre header.
3110 |/ |
3111 | v
3112 | [ ] \
3113 | [ ]_| <-- vector loop (created during VPlan execution).
3114 | |
3115 | v
3116 \ -[ ] <--- middle-block.
3117 \/ |
3118 /\ v
3119 | ->[ ] <--- new preheader.
3120 | |
3121 (opt) v <-- edge from middle to exit iff epilogue is not required.
3122 | [ ] \
3123 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3124 \ |
3125 \ v
3126 >[ ] <-- exit block(s).
3127 ...
3128 */
3129
3130 // Create an empty vector loop, and prepare basic blocks for the runtime
3131 // checks.
3133
3134  // Now, compare the new count to zero. If it is zero, skip the vector loop and
3135 // jump to the scalar loop. This check also covers the case where the
3136 // backedge-taken count is uint##_max: adding one to it will overflow leading
3137 // to an incorrect trip count of zero. In this (rare) case we will also jump
3138 // to the scalar loop.
3140
3141 // Generate the code to check any assumptions that we've made for SCEV
3142 // expressions.
3144
3145 // Generate the code that checks in runtime if arrays overlap. We put the
3146 // checks into a separate block to make the more common case of few elements
3147 // faster.
3149
3150 // Emit phis for the new starting index of the scalar loop.
3151 createInductionResumeValues(ExpandedSCEVs);
3152
3153 return {completeLoopSkeleton(), nullptr};
3154}
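// As an illustrative sketch of the iteration-count check emitted into the old
// preheader above (value names are the ones this pass usually picks; the
// comparison becomes 'ule' when a scalar epilogue is required):
//
//   entry:
//     %min.iters.check = icmp ult i64 %N, <VF * UF>
//     br i1 %min.iters.check, label %scalar.ph, label %vector.ph
//
// where the false edge leads first to the SCEV/memory runtime-check blocks, if
// any are needed, and otherwise directly to the vector preheader.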
3155
3156// Fix up external users of the induction variable. At this point, we are
3157// in LCSSA form, with all external PHIs that use the IV having one input value,
3158// coming from the remainder loop. We need those PHIs to also have a correct
3159// value for the IV when arriving directly from the middle block.
3161 const InductionDescriptor &II,
3162 Value *VectorTripCount, Value *EndValue,
3163 BasicBlock *MiddleBlock,
3164 BasicBlock *VectorHeader, VPlan &Plan,
3165 VPTransformState &State) {
3166 // There are two kinds of external IV usages - those that use the value
3167 // computed in the last iteration (the PHI) and those that use the penultimate
3168 // value (the value that feeds into the phi from the loop latch).
3169 // We allow both, but they, obviously, have different values.
3170
3171 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3172
3173 DenseMap<Value *, Value *> MissingVals;
3174
3175 // An external user of the last iteration's value should see the value that
3176 // the remainder loop uses to initialize its own IV.
3178 for (User *U : PostInc->users()) {
3179 Instruction *UI = cast<Instruction>(U);
3180 if (!OrigLoop->contains(UI)) {
3181 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3182 MissingVals[UI] = EndValue;
3183 }
3184 }
3185
3186  // An external user of the penultimate value needs to see EndValue - Step.
3187 // The simplest way to get this is to recompute it from the constituent SCEVs,
3188 // that is Start + (Step * (CRD - 1)).
3189 for (User *U : OrigPhi->users()) {
3190 auto *UI = cast<Instruction>(U);
3191 if (!OrigLoop->contains(UI)) {
3192 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3193 IRBuilder<> B(MiddleBlock->getTerminator());
3194
3195 // Fast-math-flags propagate from the original induction instruction.
3196 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3197 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3198
3199 Value *CountMinusOne = B.CreateSub(
3201 CountMinusOne->setName("cmo");
3202
3203 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3204 assert(StepVPV && "step must have been expanded during VPlan execution");
3205 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3206 : State.get(StepVPV, {0, 0});
3207 Value *Escape =
3208 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3209 II.getKind(), II.getInductionBinOp());
3210 Escape->setName("ind.escape");
3211 MissingVals[UI] = Escape;
3212 }
3213 }
3214
3215 for (auto &I : MissingVals) {
3216 PHINode *PHI = cast<PHINode>(I.first);
3217    // One corner case we have to handle is two IVs "chasing" each other,
3218 // that is %IV2 = phi [...], [ %IV1, %latch ]
3219 // In this case, if IV1 has an external use, we need to avoid adding both
3220 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3221 // don't already have an incoming value for the middle block.
3222 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3223 PHI->addIncoming(I.second, MiddleBlock);
3224 Plan.removeLiveOut(PHI);
3225 }
3226 }
3227}
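// A small worked example of the penultimate-value computation above (numbers
// are invented, purely for illustration): for an integer induction with
// Start = 0 and Step = 2 and a vector trip count of 8, EndValue is
// 0 + 2 * 8 = 16, which is what the remainder loop's phi starts from, while an
// external user of the phi itself must see the value from the last executed
// vector iteration, 0 + 2 * (8 - 1) = 14. That is exactly what
// emitTransformedIndex recomputes from CountMinusOne above.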
3228
3229namespace {
3230
3231struct CSEDenseMapInfo {
3232 static bool canHandle(const Instruction *I) {
3233 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3234 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3235 }
3236
3237 static inline Instruction *getEmptyKey() {
3239 }
3240
3241 static inline Instruction *getTombstoneKey() {
3243 }
3244
3245 static unsigned getHashValue(const Instruction *I) {
3246 assert(canHandle(I) && "Unknown instruction!");
3247 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3248 I->value_op_end()));
3249 }
3250
3251 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3252 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3253 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3254 return LHS == RHS;
3255 return LHS->isIdenticalTo(RHS);
3256 }
3257};
3258
3259} // end anonymous namespace
3260
3261/// Perform CSE of induction variable instructions.
3262static void cse(BasicBlock *BB) {
3263  // Perform simple CSE.
3265 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3266 if (!CSEDenseMapInfo::canHandle(&In))
3267 continue;
3268
3269 // Check if we can replace this instruction with any of the
3270 // visited instructions.
3271 if (Instruction *V = CSEMap.lookup(&In)) {
3272 In.replaceAllUsesWith(V);
3273 In.eraseFromParent();
3274 continue;
3275 }
3276
3277 CSEMap[&In] = &In;
3278 }
3279}
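// For example (an illustrative shorthand, not real pass output), two widened
// address computations that are operand-for-operand identical collapse into
// one:
//
//   %gep.a = getelementptr inbounds i32, ptr %p, i64 %idx
//   %l     = load <4 x i32>, ptr %gep.a
//   %gep.b = getelementptr inbounds i32, ptr %p, i64 %idx
//   store <4 x i32> %v, ptr %gep.b
//
// becomes
//
//   %gep.a = getelementptr inbounds i32, ptr %p, i64 %idx
//   %l     = load <4 x i32>, ptr %gep.a
//   store <4 x i32> %v, ptr %gep.a
//
// since the second GEP is isIdenticalTo() the first and is erased after its
// uses are replaced.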
3280
3282 CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const {
3283 Function *F = CI->getCalledFunction();
3284 Type *ScalarRetTy = CI->getType();
3285 SmallVector<Type *, 4> Tys, ScalarTys;
3286 bool MaskRequired = Legal->isMaskRequired(CI);
3287 for (auto &ArgOp : CI->args())
3288 ScalarTys.push_back(ArgOp->getType());
3289
3290 // Estimate cost of scalarized vector call. The source operands are assumed
3291 // to be vectors, so we need to extract individual elements from there,
3292 // execute VF scalar calls, and then gather the result into the vector return
3293 // value.
3295 InstructionCost ScalarCallCost =
3296 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
3297 if (VF.isScalar())
3298 return ScalarCallCost;
3299
3300 // Compute corresponding vector type for return value and arguments.
3301 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3302 for (Type *ScalarTy : ScalarTys)
3303 Tys.push_back(ToVectorTy(ScalarTy, VF));
3304
3305 // Compute costs of unpacking argument values for the scalar calls and
3306 // packing the return values to a vector.
3307 InstructionCost ScalarizationCost =
3308 getScalarizationOverhead(CI, VF, CostKind);
3309
3311 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3312
3313 // If we can't emit a vector call for this function, then the currently found
3314 // cost is the cost we need to return.
3315 InstructionCost MaskCost = 0;
3316 VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
3317 if (NeedsMask)
3318 *NeedsMask = MaskRequired;
3319 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3320 // If we want an unmasked vector function but can't find one matching the VF,
3321 // maybe we can find vector function that does use a mask and synthesize
3322 // an all-true mask.
3323 if (!VecFunc && !MaskRequired) {
3324 Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
3325 VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3326 // If we found one, add in the cost of creating a mask
3327 if (VecFunc) {
3328 if (NeedsMask)
3329 *NeedsMask = true;
3330 MaskCost = TTI.getShuffleCost(
3334 VF));
3335 }
3336 }
3337
3338 // We don't support masked function calls yet, but we can scalarize a
3339 // masked call with branches (unless VF is scalable).
3340 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3342
3343 // If the corresponding vector cost is cheaper, return its cost.
3344 InstructionCost VectorCallCost =
3345 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
3346 if (VectorCallCost < Cost) {
3347 *Variant = VecFunc;
3348 Cost = VectorCallCost;
3349 }
3350 return Cost;
3351}
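// A hypothetical numeric example of the trade-off computed above (all costs
// are invented for illustration): with VF = 4, a scalar call cost of 10 and a
// scalarization overhead of 6, the scalarized estimate is 4 * 10 + 6 = 46. If
// the VFDatabase provides a suitable vector variant whose call cost is 20
// (plus a mask cost of 0), the vector call is cheaper, so *Variant is set to
// it and 20 is returned; otherwise the scalarized cost of 46 stands.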
3352
3354 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3355 return Elt;
3356 return VectorType::get(Elt, VF);
3357}
3358
3361 ElementCount VF) const {
3363 assert(ID && "Expected intrinsic call!");
3364 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3365 FastMathFlags FMF;
3366 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3367 FMF = FPMO->getFastMathFlags();
3368
3371 SmallVector<Type *> ParamTys;
3372 std::transform(FTy->param_begin(), FTy->param_end(),
3373 std::back_inserter(ParamTys),
3374 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3375
3376 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3377 dyn_cast<IntrinsicInst>(CI));
3378 return TTI.getIntrinsicInstrCost(CostAttrs,
3380}
3381
3383 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3384 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3385 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3386}
3387
3389 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3390 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3391 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3392}
3393
3395 // For every instruction `I` in MinBWs, truncate the operands, create a
3396 // truncated version of `I` and reextend its result. InstCombine runs
3397 // later and will remove any ext/trunc pairs.
3399 for (const auto &KV : Cost->getMinimalBitwidths()) {
3400 // If the value wasn't vectorized, we must maintain the original scalar
3401 // type. The absence of the value from State indicates that it
3402 // wasn't vectorized.
3403 // FIXME: Should not rely on getVPValue at this point.
3404 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3405 if (!State.hasAnyVectorValue(Def))
3406 continue;
3407 // If the instruction is defined outside the loop, only update the first
3408 // part; the first part will be re-used for all other parts.
3409 unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
3410 for (unsigned Part = 0; Part < UFToUse; ++Part) {
3411 Value *I = State.get(Def, Part);
3412 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3413 continue;
3414 Type *OriginalTy = I->getType();
3415 Type *ScalarTruncatedTy =
3416 IntegerType::get(OriginalTy->getContext(), KV.second);
3417 auto *TruncatedTy = VectorType::get(
3418 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3419 if (TruncatedTy == OriginalTy)
3420 continue;
3421
3422 IRBuilder<> B(cast<Instruction>(I));
3423 auto ShrinkOperand = [&](Value *V) -> Value * {
3424 if (auto *ZI = dyn_cast<ZExtInst>(V))
3425 if (ZI->getSrcTy() == TruncatedTy)
3426 return ZI->getOperand(0);
3427 return B.CreateZExtOrTrunc(V, TruncatedTy);
3428 };
3429
3430 // The actual instruction modification depends on the instruction type,
3431 // unfortunately.
3432 Value *NewI = nullptr;
3433 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3434 Value *Op0 = ShrinkOperand(BO->getOperand(0));
3435 Value *Op1 = ShrinkOperand(BO->getOperand(1));
3436 NewI = B.CreateBinOp(BO->getOpcode(), Op0, Op1);
3437
3438 // Any wrapping introduced by shrinking this operation shouldn't be
3439 // considered undefined behavior. So, we can't unconditionally copy
3440 // arithmetic wrapping flags to NewI.
3441 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3442 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3443        Value *Op0 = ShrinkOperand(CI->getOperand(0));
3444        Value *Op1 = ShrinkOperand(CI->getOperand(1));
3445 NewI = B.CreateICmp(CI->getPredicate(), Op0, Op1);
3446 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3447 Value *TV = ShrinkOperand(SI->getTrueValue());
3448 Value *FV = ShrinkOperand(SI->getFalseValue());
3449 NewI = B.CreateSelect(SI->getCondition(), TV, FV);
3450 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3451 switch (CI->getOpcode()) {
3452 default:
3453 llvm_unreachable("Unhandled cast!");
3454 case Instruction::Trunc:
3455 NewI = ShrinkOperand(CI->getOperand(0));
3456 break;
3457 case Instruction::SExt:
3458 NewI = B.CreateSExtOrTrunc(
3459 CI->getOperand(0),
3460 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3461 break;
3462 case Instruction::ZExt:
3463 NewI = B.CreateZExtOrTrunc(
3464 CI->getOperand(0),
3465 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3466 break;
3467 }
3468 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3469 auto Elements0 =
3470 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3471 auto *O0 = B.CreateZExtOrTrunc(
3472 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3473 auto Elements1 =
3474 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3475 auto *O1 = B.CreateZExtOrTrunc(
3476 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3477
3478 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3479 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3480 // Don't do anything with the operands, just extend the result.
3481 continue;
3482 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3483 auto Elements =
3484 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3485 auto *O0 = B.CreateZExtOrTrunc(
3486 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3487 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3488 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3489 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3490 auto Elements =
3491 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3492 auto *O0 = B.CreateZExtOrTrunc(
3493 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3494 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3495 } else {
3496 // If we don't know what to do, be conservative and don't do anything.
3497 continue;
3498 }
3499
3500 // Lastly, extend the result.
3501 NewI->takeName(cast<Instruction>(I));
3502 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3503 I->replaceAllUsesWith(Res);
3504 cast<Instruction>(I)->eraseFromParent();
3505 Erased.insert(I);
3506 State.reset(Def, Res, Part);
3507 }
3508 }
3509
3510 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3511 for (const auto &KV : Cost->getMinimalBitwidths()) {
3512 // If the value wasn't vectorized, we must maintain the original scalar
3513 // type. The absence of the value from State indicates that it
3514 // wasn't vectorized.
3515 // FIXME: Should not rely on getVPValue at this point.
3516 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3517 if (!State.hasAnyVectorValue(Def))
3518 continue;
3519 unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
3520 for (unsigned Part = 0; Part < UFToUse; ++Part) {
3521 Value *I = State.get(Def, Part);
3522 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3523 if (Inst && Inst->use_empty()) {
3524 Value *NewI = Inst->getOperand(0);
3525 Inst->eraseFromParent();
3526 State.reset(Def, NewI, Part);
3527 }
3528 }
3529 }
3530}
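// As a hand-written sketch of the rewrite above (VF = 4; the value names are
// illustrative and the exact naming produced by the pass differs): if the cost
// model proved that an i32 add only needs 8 bits, the widened
//
//   %add = add <4 x i32> %a, %b
//
// is turned into
//
//   %a.tr   = trunc <4 x i32> %a to <4 x i8>
//   %b.tr   = trunc <4 x i32> %b to <4 x i8>
//   %add.tr = add <4 x i8> %a.tr, %b.tr
//   %res    = zext <4 x i8> %add.tr to <4 x i32>
//
// with all uses of %add replaced by %res; InstCombine later removes any
// redundant trunc/zext pairs, and wrap flags are intentionally dropped.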
3531
3533 VPlan &Plan) {
3534 // Insert truncates and extends for any truncated instructions as hints to
3535 // InstCombine.
3536 if (VF.isVector())
3538
3539 // Fix widened non-induction PHIs by setting up the PHI operands.
3541 fixNonInductionPHIs(Plan, State);
3542
3543 // At this point every instruction in the original loop is widened to a
3544 // vector form. Now we need to fix the recurrences in the loop. These PHI
3545 // nodes are currently empty because we did not want to introduce cycles.
3546 // This is the second stage of vectorizing recurrences.
3547 fixCrossIterationPHIs(State);
3548
3549 // Forget the original basic block.
3551
3552 // After vectorization, the exit blocks of the original loop will have
3553 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3554 // looked through single-entry phis.
3555 SmallVector<BasicBlock *> ExitBlocks;
3556 OrigLoop->getExitBlocks(ExitBlocks);
3557 for (BasicBlock *Exit : ExitBlocks)
3558 for (PHINode &PN : Exit->phis())
3559 PSE.getSE()->forgetValue(&PN);
3560
3562 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3563 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3564    // No edge from the middle block to the unique exit block has been inserted
3565    // and there is nothing to fix from the vector loop; phis should have
3566    // incoming values from the scalar loop only.
3567 } else {
3568 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3569 // the cost model.
3570
3571 // If we inserted an edge from the middle block to the unique exit block,
3572 // update uses outside the loop (phis) to account for the newly inserted
3573 // edge.
3574
3575 // Fix-up external users of the induction variables.
3576 for (const auto &Entry : Legal->getInductionVars())
3577 fixupIVUsers(Entry.first, Entry.second,
3579 IVEndValues[Entry.first], LoopMiddleBlock,
3580 VectorLoop->getHeader(), Plan, State);
3581 }
3582
3583 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3584 // in the exit block, so update the builder.
3585 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3586 State.CFG.ExitBB->getFirstNonPHIIt());
3587 for (const auto &KV : Plan.getLiveOuts())
3588 KV.second->fixPhi(Plan, State);
3589
3591 sinkScalarOperands(&*PI);
3592
3593 // Remove redundant induction instructions.
3594 cse(VectorLoop->getHeader());
3595
3596  // Set/update profile weights for the vector and remainder loops as original
3597  // loop iterations are now distributed among them. Note that the original loop
3598  // represented by LoopScalarBody becomes the remainder loop after vectorization.
3599  //
3600  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3601  // end up getting a slightly roughened result but that should be OK since
3602  // profile is not inherently precise anyway. Note also possible bypass of
3603  // vector code caused by legality checks is ignored, assigning all the weight
3604  // to the vector loop, optimistically.
3605  //
3606  // For scalable vectorization we can't know at compile time how many
3607  // iterations of the loop are handled in one vector iteration, so instead
3608  // assume a pessimistic vscale of '1'.
3611 VF.getKnownMinValue() * UF);
3612}
3613
3615 // In order to support recurrences we need to be able to vectorize Phi nodes.
3616 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3617 // stage #2: We now need to fix the recurrences by adding incoming edges to
3618 // the currently empty PHI nodes. At this point every instruction in the
3619 // original loop is widened to a vector form so we can use them to construct
3620 // the incoming edges.
3621 VPBasicBlock *Header =
3623
3624  // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
3625  // sunk outside of the loop keep the same order as they had in the
3626  // original loop.
3627 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
3628 for (VPRecipeBase &R : Header->phis()) {
3629 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3630 ReductionPHIList.emplace_back(ReductionPhi);
3631 }
3632 stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1,
3633 const VPReductionPHIRecipe *R2) {
3634 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
3635 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
3636
3637 // If neither of the recipes has an intermediate store, keep the order the
3638 // same.
3639 if (!IS1 && !IS2)
3640 return false;
3641
3642 // If only one of the recipes has an intermediate store, then move it
3643 // towards the beginning of the list.
3644 if (IS1 && !IS2)
3645 return true;
3646
3647 if (!IS1 && IS2)
3648 return false;
3649
3650 // If both recipes have an intermediate store, then the recipe with the
3651 // later store should be processed earlier. So it should go to the beginning
3652 // of the list.
3653 return DT->dominates(IS2, IS1);
3654 });
3655
3656 for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList)
3657 fixReduction(ReductionPhi, State);
3658
3659 for (VPRecipeBase &R : Header->phis()) {
3660 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3661 fixFixedOrderRecurrence(FOR, State);
3662 }
3663}
3664
3667 // This is the second phase of vectorizing first-order recurrences. An
3668 // overview of the transformation is described below. Suppose we have the
3669 // following loop.
3670 //
3671 // for (int i = 0; i < n; ++i)
3672 // b[i] = a[i] - a[i - 1];
3673 //
3674 // There is a first-order recurrence on "a". For this loop, the shorthand
3675 // scalar IR looks like:
3676 //
3677 // scalar.ph:
3678 // s_init = a[-1]
3679 // br scalar.body
3680 //
3681 // scalar.body:
3682 // i = phi [0, scalar.ph], [i+1, scalar.body]
3683 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3684 // s2 = a[i]
3685 // b[i] = s2 - s1
3686 // br cond, scalar.body, ...
3687 //
3688  // In this example, s1 is a recurrence because its value depends on the
3689 // previous iteration. In the first phase of vectorization, we created a
3690 // vector phi v1 for s1. We now complete the vectorization and produce the
3691 // shorthand vector IR shown below (for VF = 4, UF = 1).
3692 //
3693 // vector.ph:
3694 // v_init = vector(..., ..., ..., a[-1])
3695 // br vector.body
3696 //
3697 // vector.body
3698 // i = phi [0, vector.ph], [i+4, vector.body]
3699 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3700 // v2 = a[i, i+1, i+2, i+3];
3701 // v3 = vector(v1(3), v2(0, 1, 2))
3702 // b[i, i+1, i+2, i+3] = v2 - v3
3703 // br cond, vector.body, middle.block
3704 //
3705 // middle.block:
3706 // x = v2(3)
3707 // br scalar.ph
3708 //
3709 // scalar.ph:
3710 // s_init = phi [x, middle.block], [a[-1], otherwise]
3711 // br scalar.body
3712 //
3713 // After execution completes the vector loop, we extract the next value of
3714 // the recurrence (x) to use as the initial value in the scalar loop.
3715
3716 // Extract the last vector element in the middle block. This will be the
3717 // initial value for the recurrence when jumping to the scalar loop.
3718 VPValue *PreviousDef = PhiR->getBackedgeValue();
3719 Value *Incoming = State.get(PreviousDef, UF - 1);
3720 auto *ExtractForScalar = Incoming;
3721 auto *IdxTy = Builder.getInt32Ty();
3722 Value *RuntimeVF = nullptr;
3723 if (VF.isVector()) {
3724 auto *One = ConstantInt::get(IdxTy, 1);
3726 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3727 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3728 ExtractForScalar =
3729 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3730 }
3731
3732 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3733 assert(PhiR->getNumUsers() == 1 &&
3734 RecurSplice->getOpcode() ==
3736 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3737 SmallVector<VPLiveOut *> LiveOuts;
3738 for (VPUser *U : RecurSplice->users())
3739 if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3740 LiveOuts.push_back(LiveOut);
3741
3742 if (!LiveOuts.empty()) {
3743 // Extract the second last element in the middle block if the
3744 // Phi is used outside the loop. We need to extract the phi itself
3745 // and not the last element (the phi update in the current iteration). This
3746 // will be the value when jumping to the exit block from the
3747 // LoopMiddleBlock, when the scalar loop is not run at all.
3748 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3749 if (VF.isVector()) {
3750 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3751 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3752 Incoming, Idx, "vector.recur.extract.for.phi");
3753 } else {
3754 assert(UF > 1 && "VF and UF cannot both be 1");
3755      // When the loop is unrolled without vectorizing, initialize
3756      // ExtractForPhiUsedOutsideLoop with the part just prior to the last
3757      // unrolled part of `Incoming`. This is analogous to the vectorized case
3758      // above: extracting the second last element when VF > 1.
3759 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3760 }
3761
3762 for (VPLiveOut *LiveOut : LiveOuts) {
3763 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3764 PHINode *LCSSAPhi = LiveOut->getPhi();
3765 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3766 State.Plan->removeLiveOut(LCSSAPhi);
3767 }
3768 }
3769
3770 // Fix the initial value of the original recurrence in the scalar loop.
3772 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3773 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3774 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3775 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3776 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3777 Start->addIncoming(Incoming, BB);
3778 }
3779
3780 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3781 Phi->setName("scalar.recur");
3782}
3783
3785 VPTransformState &State) {
3786 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3787  // Get its reduction variable descriptor.
3788 assert(Legal->isReductionVariable(OrigPhi) &&
3789 "Unable to find the reduction variable");
3790 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3791
3792 RecurKind RK = RdxDesc.getRecurrenceKind();
3793 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3794 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3795 if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
3796 State.setDebugLocFrom(I->getDebugLoc());
3797
3798 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3799 // This is the vector-clone of the value that leaves the loop.
3800 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3801
3802 // Before each round, move the insertion point right between
3803 // the PHIs and the values we are going to write.
3804 // This allows us to write both PHINodes and the extractelement
3805 // instructions.
3808
3809 State.setDebugLocFrom(LoopExitInst->getDebugLoc());
3810
3811 Type *PhiTy = OrigPhi->getType();
3812
3813 VPBasicBlock *LatchVPBB =
3815 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3816 // If tail is folded by masking, the vector value to leave the loop should be
3817 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3818 // instead of the former. For an inloop reduction the reduction will already
3819 // be predicated, and does not need to be handled here.
3820 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3821 for (unsigned Part = 0; Part < UF; ++Part) {
3822 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3823 SelectInst *Sel = nullptr;
3824 for (User *U : VecLoopExitInst->users()) {
3825 if (isa<SelectInst>(U)) {
3826 assert((!Sel || U == Sel) &&
3827 "Reduction exit feeding two different selects");
3828 Sel = cast<SelectInst>(U);
3829 } else
3830 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3831 }
3832 assert(Sel && "Reduction exit feeds no select");
3833 State.reset(LoopExitInstDef, Sel, Part);
3834
3835 // If the target can create a predicated operator for the reduction at no
3836 // extra cost in the loop (for example a predicated vadd), it can be
3837 // cheaper for the select to remain in the loop than be sunk out of it,
3838 // and so use the select value for the phi instead of the old
3839 // LoopExitValue.
3842 RdxDesc.getOpcode(), PhiTy,
3844 auto *VecRdxPhi =
3845 cast<PHINode>(State.get(PhiR, Part));
3846 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3847 }
3848 }
3849 }
3850
3851 // If the vector reduction can be performed in a smaller type, we truncate
3852 // then extend the loop exit value to enable InstCombine to evaluate the
3853 // entire expression in the smaller type.
3854 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3855 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3856 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3857 Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3858 VectorParts RdxParts(UF);
3859 for (unsigned Part = 0; Part < UF; ++Part) {
3860 RdxParts[Part] = State.get(LoopExitInstDef, Part);
3861 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3862 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3863 : Builder.CreateZExt(Trunc, VecTy);
3864 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3865 if (U != Trunc) {
3866 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3867 RdxParts[Part] = Extnd;
3868 }
3869 }
3872 for (unsigned Part = 0; Part < UF; ++Part) {
3873 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3874 State.reset(LoopExitInstDef, RdxParts[Part], Part);
3875 }
3876 }
3877
3878 // Reduce all of the unrolled parts into a single vector.
3879 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3880 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3881
3882 // The middle block terminator has already been assigned a DebugLoc here (the
3883 // OrigLoop's single latch terminator). We want the whole middle block to
3884 // appear to execute on this line because: (a) it is all compiler generated,
3885 // (b) these instructions are always executed after evaluating the latch
3886 // conditional branch, and (c) other passes may add new predecessors which
3887 // terminate on this line. This is the easiest way to ensure we don't
3888 // accidentally cause an extra step back into the loop while debugging.
3890 if (PhiR->isOrdered())
3891 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3892 else {
3893 // Floating-point operations should have some FMF to enable the reduction.
3896 for (unsigned Part = 1; Part < UF; ++Part) {
3897 Value *RdxPart = State.get(LoopExitInstDef, Part);
3898 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3899 ReducedPartRdx = Builder.CreateBinOp(
3900 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3902 ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
3903 ReducedPartRdx, RdxPart);
3904 else
3905 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3906 }
3907 }
3908
3909 // Create the reduction after the loop. Note that inloop reductions create the
3910 // target reduction in the loop using a Reduction recipe.
3911 if (VF.isVector() && !PhiR->isInLoop()) {
3912 ReducedPartRdx =
3913 createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
3914 // If the reduction can be performed in a smaller type, we need to extend
3915 // the reduction to the wider type before we branch to the original loop.
3916 if (PhiTy != RdxDesc.getRecurrenceType())
3917 ReducedPartRdx = RdxDesc.isSigned()
3918 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3919 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3920 }
3921
3922 PHINode *ResumePhi =
3923 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3924
3925 // Create a phi node that merges control-flow from the backedge-taken check
3926 // block and the middle block.
3927 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3929
3930 // If we are fixing reductions in the epilogue loop then we should already
3931 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3932 // we carry over the incoming values correctly.
3933 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3934 if (Incoming == LoopMiddleBlock)
3935 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3936 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3937 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3938 Incoming);
3939 else
3940 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
3941 }
3942
3943 // Set the resume value for this reduction
3944 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
3945
3946 // If there were stores of the reduction value to a uniform memory address
3947 // inside the loop, create the final store here.
3948 if (StoreInst *SI = RdxDesc.IntermediateStore) {
3949 StoreInst *NewSI =
3950 Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
3951 SI->getAlign());
3952 propagateMetadata(NewSI, SI);
3953
3954    // If the reduction value is used in other places,
3955    // then let the code below create PHIs for that.
3956 }
3957
3958 // Now, we need to fix the users of the reduction variable
3959 // inside and outside of the scalar remainder loop.
3960
3961 // We know that the loop is in LCSSA form. We need to update the PHI nodes
3962 // in the exit blocks. See comment on analogous loop in
3963  // fixFixedOrderRecurrence for a more complete explanation of the logic.
3964 if (!Cost->requiresScalarEpilogue(VF.isVector()))
3965 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3966 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
3967 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3968 State.Plan->removeLiveOut(&LCSSAPhi);
3969 }
3970
3971 // Fix the scalar loop reduction variable with the incoming reduction sum
3972 // from the vector body and from the backedge value.
3973 int IncomingEdgeBlockIdx =
3975 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3976 // Pick the other block.
3977 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3978 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3979 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3980}
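// As a shorthand illustration of the epilogue built above (VF = 4, UF = 2,
// an integer add reduction; value names follow this pass's conventions but
// the precise IR depends on the input):
//
//   middle.block:
//     %bin.rdx = add <4 x i32> %part1, %part0
//     %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
//     ...
//   scalar.ph:
//     %bc.merge.rdx = phi i32 [ %rdx, %middle.block ], [ %start, ... ]
//
// where %bc.merge.rdx feeds the reduction phi of the scalar remainder loop and
// %rdx also replaces the reduction's LCSSA uses outside the loop.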
3981
3983 // The basic block and loop containing the predicated instruction.
3984 auto *PredBB = PredInst->getParent();
3985 auto *VectorLoop = LI->getLoopFor(PredBB);
3986
3987 // Initialize a worklist with the operands of the predicated instruction.
3988 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3989
3990 // Holds instructions that we need to analyze again. An instruction may be
3991 // reanalyzed if we don't yet know if we can sink it or not.
3992 SmallVector<Instruction *, 8> InstsToReanalyze;
3993
3994 // Returns true if a given use occurs in the predicated block. Phi nodes use
3995 // their operands in their corresponding predecessor blocks.
3996 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3997 auto *I = cast<Instruction>(U.getUser());
3998 BasicBlock *BB = I->getParent();
3999 if (auto *Phi = dyn_cast<PHINode>(I))
4000 BB = Phi->getIncomingBlock(
4001 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4002 return BB == PredBB;
4003 };
4004
4005  // Iteratively sink the scalarized operands of the predicated instruction
4006  // into the block we created for it. When an instruction is sunk, its
4007  // operands are then added to the worklist. The algorithm ends once a full
4008  // pass through the worklist doesn't sink a single instruction.
4009 bool Changed;
4010 do {
4011 // Add the instructions that need to be reanalyzed to the worklist, and
4012 // reset the changed indicator.
4013 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4014 InstsToReanalyze.clear();
4015 Changed = false;
4016
4017 while (!Worklist.empty()) {
4018 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4019
4020 // We can't sink an instruction if it is a phi node, is not in the loop,
4021 // may have side effects or may read from memory.
4022      // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
4023 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4024 I->mayHaveSideEffects() || I->mayReadFromMemory())
4025 continue;
4026
4027 // If the instruction is already in PredBB, check if we can sink its
4028 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4029 // sinking the scalar instruction I, hence it appears in PredBB; but it
4030 // may have failed to sink I's operands (recursively), which we try
4031 // (again) here.
4032 if (I->getParent() == PredBB) {
4033 Worklist.insert(I->op_begin(), I->op_end());
4034 continue;
4035 }
4036
4037 // It's legal to sink the instruction if all its uses occur in the
4038 // predicated block. Otherwise, there's nothing to do yet, and we may
4039 // need to reanalyze the instruction.
4040 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4041 InstsToReanalyze.push_back(I);
4042 continue;
4043 }
4044
4045      // Move the instruction to the beginning of the predicated block, and add
4046      // its operands to the worklist.
4047 I->moveBefore(&*PredBB->getFirstInsertionPt());
4048 Worklist.insert(I->op_begin(), I->op_end());
4049
4050 // The sinking may have enabled other instructions to be sunk, so we will
4051 // need to iterate.
4052 Changed = true;
4053 }
4054 } while (Changed);
4055}
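// An illustrative scenario for the worklist algorithm above: suppose a
// predicated store was scalarized into its own block (typically named
// pred.store.if) and the address computation
//
//   %gep = getelementptr inbounds i32, ptr %p, i64 %iv
//
// still sits in the vector body while its only use is that scalarized store.
// Because every use of %gep occurs in the predicated block, the loop above
// moves %gep into pred.store.if; %gep's own operands are then pushed onto the
// worklist and may be sunk in a later pass over it.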
4056
4058 VPTransformState &State) {
4059 auto Iter = vp_depth_first_deep(Plan.getEntry());
4060 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4061 for (VPRecipeBase &P : VPBB->phis()) {
4062 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4063 if (!VPPhi)
4064 continue;
4065 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4066 // Make sure the builder has a valid insert point.
4067 Builder.SetInsertPoint(NewPhi);
4068 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4069 VPValue *Inc = VPPhi->getIncomingValue(i);
4070 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4071 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4072 }
4073 }
4074 }
4075}
4076
4078 const RecurrenceDescriptor &RdxDesc) {
4079 return Cost->useOrderedReductions(RdxDesc);
4080}
4081
4082void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4083 // We should not collect Scalars more than once per VF. Right now, this
4084 // function is called from collectUniformsAndScalars(), which already does
4085 // this check. Collecting Scalars for VF=1 does not make any sense.
4086 assert(VF.isVector() && !Scalars.contains(VF) &&
4087 "This function should not be visited twice for the same VF");
4088
4089 // This avoids any chances of creating a REPLICATE recipe during planning
4090 // since that would result in generation of scalarized code during execution,
4091 // which is not supported for scalable vectors.
4092 if (VF.isScalable()) {
4093 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4094 return;
4095 }
4096
4098
4099 // These sets are used to seed the analysis with pointers used by memory
4100 // accesses that will remain scalar.
4102 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4103 auto *Latch = TheLoop->getLoopLatch();
4104
4105 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4106 // The pointer operands of loads and stores will be scalar as long as the
4107 // memory access is not a gather or scatter operation. The value operand of a
4108 // store will remain scalar if the store is scalarized.
4109 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4110 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4111 assert(WideningDecision != CM_Unknown &&
4112 "Widening decision should be ready at this moment");
4113 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4114 if (Ptr == Store->getValueOperand())
4115 return WideningDecision == CM_Scalarize;
4116 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4117 "Ptr is neither a value or pointer operand");
4118 return WideningDecision != CM_GatherScatter;
4119 };
4120
4121 // A helper that returns true if the given value is a bitcast or
4122 // getelementptr instruction contained in the loop.
4123 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4124 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4125 isa<GetElementPtrInst>(V)) &&
4127 };
4128
4129 // A helper that evaluates a memory access's use of a pointer. If the use will
4130 // be a scalar use and the pointer is only used by memory accesses, we place
4131 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4132 // PossibleNonScalarPtrs.
4133 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4134 // We only care about bitcast and getelementptr instructions contained in
4135 // the loop.
4136 if (!isLoopVaryingBitCastOrGEP(Ptr))
4137 return;
4138
4139 // If the pointer has already been identified as scalar (e.g., if it was
4140 // also identified as uniform), there's nothing to do.
4141 auto *I = cast<Instruction>(Ptr);
4142 if (Worklist.count(I))
4143 return;
4144
4145 // If the use of the pointer will be a scalar use, and all users of the
4146 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4147 // place the pointer in PossibleNonScalarPtrs.
4148 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4149 return isa<LoadInst>(U) || isa<StoreInst>(U);
4150 }))
4151 ScalarPtrs.insert(I);
4152 else
4153 PossibleNonScalarPtrs.insert(I);
4154 };
4155
4156  // We seed the scalars analysis with two classes of instructions: (1)
4157  // instructions marked uniform-after-vectorization and (2) bitcast,
4158  // getelementptr and (pointer) phi instructions used by memory accesses
4159  // requiring a scalar use.
4160 //
4161 // (1) Add to the worklist all instructions that have been identified as
4162 // uniform-after-vectorization.
4163 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4164
4165 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4166 // memory accesses requiring a scalar use. The pointer operands of loads and
4167  // stores will be scalar as long as the memory access is not a gather or
4168 // scatter operation. The value operand of a store will remain scalar if the
4169 // store is scalarized.
4170 for (auto *BB : TheLoop->blocks())
4171 for (auto &I : *BB) {
4172 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4173 evaluatePtrUse(Load, Load->getPointerOperand());
4174 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4175 evaluatePtrUse(Store, Store->getPointerOperand());
4176 evaluatePtrUse(Store, Store->getValueOperand());
4177 }
4178 }
4179 for (auto *I : ScalarPtrs)
4180 if (!PossibleNonScalarPtrs.count(I)) {
4181 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4182 Worklist.insert(I);
4183 }
4184
4185 // Insert the forced scalars.
4186 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4187 // induction variable when the PHI user is scalarized.
4188 auto ForcedScalar = ForcedScalars.find(VF);
4189 if (ForcedScalar != ForcedScalars.end())
4190 for (auto *I : ForcedScalar->second) {
4191 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4192 Worklist.insert(I);
4193 }
4194
4195 // Expand the worklist by looking through any bitcasts and getelementptr
4196 // instructions we've already identified as scalar. This is similar to the
4197 // expansion step in collectLoopUniforms(); however, here we're only
4198 // expanding to include additional bitcasts and getelementptr instructions.
4199 unsigned Idx = 0;
4200 while (Idx != Worklist.size()) {
4201 Instruction *Dst = Worklist[Idx++];
4202 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4203 continue;
4204 auto *Src = cast<Instruction>(Dst->getOperand(0));
4205 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4206 auto *J = cast<Instruction>(U);
4207 return !TheLoop->contains(J) || Worklist.count(J) ||
4208 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4209 isScalarUse(J, Src));
4210 })) {
4211 Worklist.insert(Src);
4212 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4213 }
4214 }
4215
4216 // An induction variable will remain scalar if all users of the induction
4217 // variable and induction variable update remain scalar.
4218 for (const auto &Induction : Legal->getInductionVars()) {
4219 auto *Ind = Induction.first;
4220 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4221
4222 // If tail-folding is applied, the primary induction variable will be used
4223 // to feed a vector compare.
4224 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4225 continue;
4226
4227 // Returns true if \p Indvar is a pointer induction that is used directly by
4228 // load/store instruction \p I.
4229 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4230 Instruction *I) {
4231 return Induction.second.getKind() ==
4233 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4234 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4235 };
4236
4237 // Determine if all users of the induction variable are scalar after
4238 // vectorization.
4239 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4240 auto *I = cast<Instruction>(U);
4241 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4242 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4243 });
4244 if (!ScalarInd)
4245 continue;
4246
4247 // Determine if all users of the induction variable update instruction are
4248 // scalar after vectorization.
4249 auto ScalarIndUpdate =
4250 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4251 auto *I = cast<Instruction>(U);
4252 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4253 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4254 });
4255 if (!ScalarIndUpdate)
4256 continue;
4257
4258 // The induction variable and its update instruction will remain scalar.
4259 Worklist.insert(Ind);
4260 Worklist.insert(IndUpdate);
4261 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4262 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4263 << "\n");
4264 }
4265
4266 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4267}
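// A small illustrative example of the analysis above (C-level view, not taken
// from any particular test):
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// The getelementptrs feeding the consecutive load of b[i] and the store to
// a[i] are used only as addresses of widened memory accesses, so they land in
// ScalarPtrs and therefore in Scalars[VF]. If, in addition, every user of i
// and of its update remains scalar (here the GEPs and the uniform latch
// compare), the induction variable and its update are added to Scalars[VF] as
// well.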
4268
4270 Instruction *I, ElementCount VF) const {
4271 if (!isPredicatedInst(I))
4272 return false;
4273
4274 // Do we have a non-scalar lowering for this predicated
4275 // instruction? No - it is scalar with predication.
4276  switch (I->getOpcode()) {
4277 default:
4278 return true;
4279 case Instruction::Call:
4280 return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
4281 case Instruction::Load:
4282 case Instruction::Store: {
4284 auto *Ty = getLoadStoreType(I);
4285 Type *VTy = Ty;
4286 if (VF.isVector())
4287 VTy = VectorType::get(Ty, VF);
4288 const Align Alignment = getLoadStoreAlignment(I);
4289 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4290 TTI.isLegalMaskedGather(VTy, Alignment))
4291 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4292 TTI.isLegalMaskedScatter(VTy, Alignment));
4293 }
4294 case Instruction::UDiv:
4295 case Instruction::SDiv:
4296 case Instruction::SRem:
4297 case Instruction::URem: {
4298    // We have the option to use the safe-divisor idiom to avoid predication.
4299    // The cost-based decision here will always select the safe-divisor idiom
4300    // for scalable vectors, as scalarization isn't legal.
4301 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4302 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4303 }
4304 }
4305}
4306
4308 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4309 return false;
4310
4311 // Can we prove this instruction is safe to unconditionally execute?
4312 // If not, we must use some form of predication.
4313  switch (I->getOpcode()) {
4314 default:
4315 return false;
4316 case Instruction::Load:
4317 case Instruction::Store: {
4318 if (!Legal->isMaskRequired(I))
4319 return false;
4320 // When we know the load's address is loop invariant and the instruction
4321 // in the original scalar loop was unconditionally executed then we
4322 // don't need to mark it as a predicated instruction. Tail folding may
4323 // introduce additional predication, but we're guaranteed to always have
4324 // at least one active lane. We call Legal->blockNeedsPredication here
4325      // because it doesn't query tail-folding. For stores, we need to prove
4326      // both speculation safety (which follows from the same argument as for
4327      // loads) and that the value being stored is correct. The easiest
4328      // form of the latter is to require that all values stored are the same.
4330 (isa<LoadInst>(I) ||
4331 (isa<StoreInst>(I) &&
4332 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4333 !Legal->blockNeedsPredication(I->getParent()))
4334 return false;
4335 return true;
4336 }
4337 case Instruction::UDiv:
4338 case Instruction::SDiv:
4339 case Instruction::SRem:
4340 case Instruction::URem:
4341    // TODO: We can use the loop preheader as the context point here and get
4342    // context-sensitive reasoning.
4344 case Instruction::Call:
4345 return Legal->isMaskRequired(I);
4346 }
4347}
4348
4349std::pair<InstructionCost, InstructionCost>
4351 ElementCount VF) const {
4352 assert(I->getOpcode() == Instruction::UDiv ||
4353 I->getOpcode() == Instruction::SDiv ||
4354 I->getOpcode() == Instruction::SRem ||
4355 I->getOpcode() == Instruction::URem);
4357
4359
4360 // Scalarization isn't legal for scalable vector types
4361 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4362 if (!VF.isScalable()) {
4363 // Get the scalarization cost and scale this amount by the probability of
4364 // executing the predicated block. If the instruction is not predicated,
4365 // we fall through to the next case.
4366 ScalarizationCost = 0;
4367
4368 // These instructions have a non-void type, so account for the phi nodes
4369 // that we will create. This cost is likely to be zero. The phi node
4370 // cost, if any, should be scaled by the block probability because it
4371 // models a copy at the end of each predicated block.
4372 ScalarizationCost += VF.getKnownMinValue() *
4373 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4374
4375 // The cost of the non-predicated instruction.
4376 ScalarizationCost += VF.getKnownMinValue() *
4377 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4378
4379 // The cost of insertelement and extractelement instructions needed for
4380 // scalarization.
4381 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4382
4383 // Scale the cost by the probability of executing the predicated blocks.
4384 // This assumes the predicated block for each vector lane is equally
4385 // likely.
4386 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4387 }
4388 InstructionCost SafeDivisorCost = 0;
4389
4390 auto *VecTy = ToVectorTy(I->getType(), VF);
4391
4392 // The cost of the select guard to ensure all lanes are well defined
4393 // after we speculate above any internal control flow.
4394 SafeDivisorCost += TTI.getCmpSelInstrCost(
4395 Instruction::Select, VecTy,
4396 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4398
4399  // Certain instructions can be cheaper to vectorize if they have a constant
4400  // second vector operand. One example of this is shifts on x86.
4401 Value *Op2 = I->getOperand(1);
4402 auto Op2Info = TTI.getOperandInfo(Op2);
4403 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4404 Legal->isInvariant(Op2))
4406
4407 SmallVector<const Value *, 4> Operands(I->operand_values());
4408 SafeDivisorCost += TTI.getArithmeticInstrCost(
4409 I->getOpcode(), VecTy, CostKind,
4410 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4411 Op2Info, Operands, I);
4412 return {ScalarizationCost, SafeDivisorCost};
4413}
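// A hypothetical numeric illustration of the two alternatives computed above
// (all costs are invented): with VF = 4, a reciprocal predicated-block
// probability of 2, a scalar udiv cost of 4, a PHI cost of 0 and a
// scalarization overhead of 8, the scalarized estimate is
// (4 * 0 + 4 * 4 + 8) / 2 = 12. The safe-divisor estimate is one vector select
// plus one vector udiv, e.g. 2 + 8 = 10, in which case the caller would prefer
// the safe-divisor lowering over scalarization with predication.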
4414
4416 Instruction *I, ElementCount VF) {
4417 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4419 "Decision should not be set yet.");
4420 auto *Group = getInterleavedAccessGroup(I);
4421 assert(Group && "Must have a group.");
4422
4423  // If the instruction's allocated size doesn't equal its type size, it
4424  // requires padding and will be scalarized.
4425 auto &DL = I->getModule()->getDataLayout();
4426 auto *ScalarTy = getLoadStoreType(I);
4427 if (hasIrregularType(ScalarTy, DL))
4428 return false;
4429
4430 // If the group involves a non-integral pointer, we may not be able to
4431 // losslessly cast all values to a common type.
4432 unsigned InterleaveFactor = Group->getFactor();
4433 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4434 for (unsigned i = 0; i < InterleaveFactor; i++) {
4435 Instruction *Member = Group->getMember(i);
4436 if (!Member)
4437 continue;
4438 auto *MemberTy = getLoadStoreType(Member);
4439 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4440 // Don't coerce non-integral pointers to integers or vice versa.
4441 if (MemberNI != ScalarNI) {
4442 // TODO: Consider adding special nullptr value case here
4443 return false;
4444 } else if (MemberNI && ScalarNI &&
4445 ScalarTy->getPointerAddressSpace() !=
4446 MemberTy->getPointerAddressSpace()) {
4447 return false;
4448 }
4449 }
4450
4451 // Check if masking is required.
4452 // A Group may need masking for one of two reasons: it resides in a block that
4453 // needs predication, or it was decided to use masking to deal with gaps
4454 // (either a gap at the end of a load-access that may result in a speculative
4455 // load, or any gaps in a store-access).
4456 bool PredicatedAccessRequiresMasking =
4457 blockNeedsPredicationForAnyReason(I->getParent()) &&
4459 bool LoadAccessWithGapsRequiresEpilogMasking =
4460 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4462 bool StoreAccessWithGapsRequiresMasking =
4463 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4464 if (!PredicatedAccessRequiresMasking &&
4465 !LoadAccessWithGapsRequiresEpilogMasking &&
4466 !StoreAccessWithGapsRequiresMasking)
4467 return true;
4468
4469 // If masked interleaving is required, we expect that the user/target had
4470 // enabled it, because otherwise it either wouldn't have been created or
4471 // it should have been invalidated by the CostModel.
4473 "Masked interleave-groups for predicated accesses are not enabled.");
4474
4475 if (Group->isReverse())
4476 return false;
4477
4478 auto *Ty = getLoadStoreType(I);
4479 const Align Alignment = getLoadStoreAlignment(I);
4480 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4481 : TTI.isLegalMaskedStore(Ty, Alignment);
4482}
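// For example (illustrative): a factor-2 load group for
//
//   for (i = 0; i < n; ++i)
//     sum += p[2 * i];   // only the even elements are accessed
//
// has a gap at member index 1. A wide load of the group may read p[2 * n - 1],
// one element past what the scalar loop ever touches, so the group either
// needs a scalar epilogue or, when one is not allowed, masking of the trailing
// gap; the checks above then make sure the target supports such a masked load
// of the group's element type.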
4483
4485 Instruction *I, ElementCount VF) {
4486 // Get and ensure we have a valid memory instruction.
4487 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4488
4490 auto *ScalarTy = getLoadStoreType(I);
4491
4492 // In order to be widened, the pointer should be consecutive, first of all.
4493 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4494 return false;
4495
4496 // If the instruction is a store located in a predicated block, it will be
4497 // scalarized.
4498 if (isScalarWithPredication(I, VF))
4499 return false;
4500
4501  // If the instruction's allocated size doesn't equal its type size, it
4502  // requires padding and will be scalarized.
4503 auto &DL = I->getModule()->getDataLayout();
4504 if (hasIrregularType(ScalarTy, DL))
4505 return false;
4506
4507 return true;
4508}
4509
4510void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4511 // We should not collect Uniforms more than once per VF. Right now,
4512 // this function is called from collectUniformsAndScalars(), which
4513  // already does this check. Collecting Uniforms for VF=1 does not make any