LLVM 23.0.0git
LoopVectorize.cpp
Go to the documentation of this file.
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160using namespace SCEVPatternMatch;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169STATISTIC(LoopsVectorized, "Number of loops vectorized");
170STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
172STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
173
175 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
176 cl::desc("Enable vectorization of epilogue loops."));
177
179 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
180 cl::desc("When epilogue vectorization is enabled, and a value greater than "
181 "1 is specified, forces the given VF for all applicable epilogue "
182 "loops."));
183
185 "epilogue-vectorization-minimum-VF", cl::Hidden,
186 cl::desc("Only loops with vectorization factor equal to or larger than "
187 "the specified value are considered for epilogue vectorization."));
188
189/// Loops with a known constant trip count below this number are vectorized only
190/// if no scalar iteration overheads are incurred.
192 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
193 cl::desc("Loops with a constant trip count that is smaller than this "
194 "value are vectorized only if no scalar iteration overheads "
195 "are incurred."));
196
198 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
199 cl::desc("The maximum allowed number of runtime memory checks"));
200
201/// Note: This currently only applies to `llvm.masked.load` and
202/// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
204 "force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden,
205 cl::desc("Assume the target supports masked memory operations (used for "
206 "testing)."));
207
208// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
209// that predication is preferred, and this lists all options. I.e., the
210// vectorizer will try to fold the tail-loop (epilogue) into the vector body
211// and predicate the instructions accordingly. If tail-folding fails, there are
212// different fallback strategies depending on these values:
219} // namespace PreferPredicateTy
220
222 "prefer-predicate-over-epilogue",
225 cl::desc("Tail-folding and predication preferences over creating a scalar "
226 "epilogue loop."),
228 "scalar-epilogue",
229 "Don't tail-predicate loops, create scalar epilogue"),
231 "predicate-else-scalar-epilogue",
232 "prefer tail-folding, create scalar epilogue if tail "
233 "folding fails."),
235 "predicate-dont-vectorize",
236 "prefers tail-folding, don't attempt vectorization if "
237 "tail-folding fails.")));
238
240 "force-tail-folding-style", cl::desc("Force the tail folding style"),
243 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
246 "Create lane mask for data only, using active.lane.mask intrinsic"),
248 "data-without-lane-mask",
249 "Create lane mask with compare/stepvector"),
251 "Create lane mask using active.lane.mask intrinsic, and use "
252 "it for both data and control flow"),
254 "Use predicated EVL instructions for tail folding. If EVL "
255 "is unsupported, fallback to data-without-lane-mask.")));
256
258 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
259 cl::desc("Enable use of wide lane masks when used for control flow in "
260 "tail-folded loops"));
261
263 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
264 cl::desc("Maximize bandwidth when selecting vectorization factor which "
265 "will be determined by the smallest type in loop."));
266
268 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
269 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
270
271/// An interleave-group may need masking if it resides in a block that needs
272/// predication, or in order to mask away gaps.
274 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
275 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
276
278 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of scalar registers."));
280
282 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
283 cl::desc("A flag that overrides the target's number of vector registers."));
284
286 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "scalar loops."));
289
291 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
292 cl::desc("A flag that overrides the target's max interleave factor for "
293 "vectorized loops."));
294
296 "force-target-instruction-cost", cl::init(0), cl::Hidden,
297 cl::desc("A flag that overrides the target's expected cost for "
298 "an instruction to a single constant value. Mostly "
299 "useful for getting consistent testing."));
300
302 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
303 cl::desc(
304 "Pretend that scalable vectors are supported, even if the target does "
305 "not support them. This flag should only be used for testing."));
306
308 "small-loop-cost", cl::init(20), cl::Hidden,
309 cl::desc(
310 "The cost of a loop that is considered 'small' by the interleaver."));
311
313 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
314 cl::desc("Enable the use of the block frequency analysis to access PGO "
315 "heuristics minimizing code growth in cold regions and being more "
316 "aggressive in hot regions."));
317
318// Runtime interleave loops for load/store throughput.
320 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
321 cl::desc(
322 "Enable runtime interleaving until load/store ports are saturated"));
323
324/// The number of stores in a loop that are allowed to need predication.
326 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
327 cl::desc("Max number of stores to be predicated behind an if."));
328
330 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
331 cl::desc("Count the induction variable only once when interleaving"));
332
334 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
335 cl::desc("Enable if predication of stores during vectorization."));
336
338 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
339 cl::desc("The maximum interleave count to use when interleaving a scalar "
340 "reduction in a nested loop."));
341
342static cl::opt<bool>
343 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
345 cl::desc("Prefer in-loop vector reductions, "
346 "overriding the targets preference."));
347
349 "force-ordered-reductions", cl::init(false), cl::Hidden,
350 cl::desc("Enable the vectorisation of loops with in-order (strict) "
351 "FP reductions"));
352
354 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
355 cl::desc(
356 "Prefer predicating a reduction operation over an after loop select."));
357
359 "enable-vplan-native-path", cl::Hidden,
360 cl::desc("Enable VPlan-native vectorization path with "
361 "support for outer loop vectorization."));
362
364 llvm::VerifyEachVPlan("vplan-verify-each",
365#ifdef EXPENSIVE_CHECKS
366 cl::init(true),
367#else
368 cl::init(false),
369#endif
371 cl::desc("Verify VPlans after VPlan transforms."));
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
375 "vplan-print-after-all", cl::init(false), cl::Hidden,
376 cl::desc("Print VPlans after all VPlan transformations."));
377
379 "vplan-print-after", cl::Hidden,
380 cl::desc("Print VPlans after specified VPlan transformations (regexp)."));
381
383 "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
384 cl::desc("Limit VPlan printing to vector loop region in "
385 "`-vplan-print-after*` if the plan has one."));
386#endif
387
388// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
390// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
391// verification of the H-CFGs built.
393 "vplan-build-stress-test", cl::init(false), cl::Hidden,
394 cl::desc(
395 "Build VPlan for every supported loop nest in the function and bail "
396 "out right after the build (stress test the VPlan H-CFG construction "
397 "in the VPlan-native vectorization path)."));
398
400 "interleave-loops", cl::init(true), cl::Hidden,
401 cl::desc("Enable loop interleaving in Loop vectorization passes"));
403 "vectorize-loops", cl::init(true), cl::Hidden,
404 cl::desc("Run the Loop vectorization passes"));
405
407 "force-widen-divrem-via-safe-divisor", cl::Hidden,
408 cl::desc(
409 "Override cost based safe divisor widening for div/rem instructions"));
410
412 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
414 cl::desc("Try wider VFs if they enable the use of vector variants"));
415
417 "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
418 cl::desc(
419 "Enable vectorization of early exit loops with uncountable exits."));
420
422 "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
423 cl::desc("Discard VFs if their register pressure is too high."));
424
// Likelihood of bypassing the vectorized loop because there are zero trips left
426// after prolog. See `emitIterationCountCheck`.
427static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
428
429/// A helper function that returns true if the given type is irregular. The
430/// type is irregular if its allocated size doesn't equal the store size of an
431/// element of the corresponding vector type.
432static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
433 // Determine if an array of N elements of type Ty is "bitcast compatible"
434 // with a <N x Ty> vector.
435 // This is only true if there is no padding between the array elements.
436 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
437}
438
439/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
440/// ElementCount to include loops whose trip count is a function of vscale.
442 const Loop *L) {
443 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
444 return ElementCount::getFixed(ExpectedTC);
445
446 const SCEV *BTC = SE->getBackedgeTakenCount(L);
448 return ElementCount::getFixed(0);
449
450 const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
451 if (isa<SCEVVScale>(ExitCount))
453
454 const APInt *Scale;
455 if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
456 if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
457 if (Scale->getActiveBits() <= 32)
459
460 return ElementCount::getFixed(0);
461}
462
463/// Get the maximum trip count for \p L from the SCEV unsigned range, excluding
464/// zero from the range. Only valid when not folding the tail, as the minimum
465/// iteration count check guards against a zero trip count. Returns 0 if
466/// unknown.
468 Loop *L) {
469 const SCEV *BTC = PSE.getBackedgeTakenCount();
471 return 0;
472 ScalarEvolution *SE = PSE.getSE();
473 const SCEV *TripCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
474 ConstantRange TCRange = SE->getUnsignedRange(TripCount);
475 APInt MaxTCFromRange = TCRange.getUnsignedMax();
476 if (!MaxTCFromRange.isZero() && MaxTCFromRange.getActiveBits() <= 32)
477 return MaxTCFromRange.getZExtValue();
478 return 0;
479}
480
481/// Returns "best known" trip count, which is either a valid positive trip count
482/// or std::nullopt when an estimate cannot be made (including when the trip
483/// count would overflow), for the specified loop \p L as defined by the
484/// following procedure:
485/// 1) Returns exact trip count if it is known.
486/// 2) Returns expected trip count according to profile data if any.
487/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
488/// 4) Returns the maximum trip count from the SCEV range excluding zero,
489/// if \p CanUseConstantMax and \p CanExcludeZeroTrips.
490/// 5) Returns std::nullopt if all of the above failed.
491static std::optional<ElementCount>
493 bool CanUseConstantMax = true,
494 bool CanExcludeZeroTrips = false) {
495 // Check if exact trip count is known.
496 if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
497 return ExpectedTC;
498
499 // Check if there is an expected trip count available from profile data.
501 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
502 return ElementCount::getFixed(*EstimatedTC);
503
504 if (!CanUseConstantMax)
505 return std::nullopt;
506
507 // Check if upper bound estimate is known.
508 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
509 return ElementCount::getFixed(ExpectedTC);
510
511 // Get the maximum trip count from the SCEV range excluding zero. This is
512 // only safe when not folding the tail, as the minimum iteration count check
513 // prevents entering the vector loop with a zero trip count.
514 if (CanUseConstantMax && CanExcludeZeroTrips)
515 if (unsigned RefinedTC = getMaxTCFromNonZeroRange(PSE, L))
516 return ElementCount::getFixed(RefinedTC);
517
518 return std::nullopt;
519}
520
521namespace {
522// Forward declare GeneratedRTChecks.
523class GeneratedRTChecks;
524
525using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
526} // namespace
527
528namespace llvm {
529
531
532/// InnerLoopVectorizer vectorizes loops which contain only one basic
533/// block to a specified vectorization factor (VF).
534/// This class performs the widening of scalars into vectors, or multiple
535/// scalars. This class also implements the following features:
536/// * It inserts an epilogue loop for handling loops that don't have iteration
537/// counts that are known to be a multiple of the vectorization factor.
538/// * It handles the code generation for reduction variables.
539/// * Scalarization (implementation using scalars) of un-vectorizable
540/// instructions.
541/// InnerLoopVectorizer does not perform any vectorization-legality
542/// checks, and relies on the caller to check for the different legality
543/// aspects. The InnerLoopVectorizer relies on the
544/// LoopVectorizationLegality class to provide information about the induction
545/// and reduction variables that were found to a given vectorization factor.
547public:
551 ElementCount VecWidth, unsigned UnrollFactor,
553 GeneratedRTChecks &RTChecks, VPlan &Plan)
554 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
555 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
558 Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
559
560 virtual ~InnerLoopVectorizer() = default;
561
562 /// Creates a basic block for the scalar preheader. Both
563 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
564 /// the method to create additional blocks and checks needed for epilogue
565 /// vectorization.
567
568 /// Fix the vectorized code, taking care of header phi's, and more.
570
571 /// Fix the non-induction PHIs in \p Plan.
573
574protected:
576
577 /// Create and return a new IR basic block for the scalar preheader whose name
578 /// is prefixed with \p Prefix.
580
581 /// Allow subclasses to override and print debug traces before/after vplan
582 /// execution, when trace information is requested.
583 virtual void printDebugTracesAtStart() {}
584 virtual void printDebugTracesAtEnd() {}
585
586 /// The original loop.
588
589 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
590 /// dynamic knowledge to simplify SCEV expressions and converts them to a
591 /// more usable form.
593
594 /// Loop Info.
596
597 /// Dominator Tree.
599
600 /// Target Transform Info.
602
603 /// Assumption Cache.
605
606 /// The vectorization SIMD factor to use. Each vector will have this many
607 /// vector elements.
609
610 /// The vectorization unroll factor to use. Each scalar is vectorized to this
611 /// many different vector instructions.
612 unsigned UF;
613
614 /// The builder that we use
616
617 // --- Vectorization state ---
618
619 /// The profitablity analysis.
621
622 /// Structure to hold information about generated runtime checks, responsible
623 /// for cleaning the checks, if vectorization turns out unprofitable.
624 GeneratedRTChecks &RTChecks;
625
627
628 /// The vector preheader block of \p Plan, used as target for check blocks
629 /// introduced during skeleton creation.
631};
632
633/// Encapsulate information regarding vectorization of a loop and its epilogue.
634/// This information is meant to be updated and used across two stages of
635/// epilogue vectorization.
638 unsigned MainLoopUF = 0;
640 unsigned EpilogueUF = 0;
645
647 ElementCount EVF, unsigned EUF,
649 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
651 assert(EUF == 1 &&
652 "A high UF for the epilogue loop is likely not beneficial.");
653 }
654};
655
656/// An extension of the inner loop vectorizer that creates a skeleton for a
657/// vectorized loop that has its epilogue (residual) also vectorized.
658/// The idea is to run the vplan on a given loop twice, firstly to setup the
659/// skeleton and vectorize the main loop, and secondly to complete the skeleton
660/// from the first step and vectorize the epilogue. This is achieved by
661/// deriving two concrete strategy classes from this base class and invoking
662/// them in succession from the loop vectorizer planner.
664public:
674
675 /// Holds and updates state information required to vectorize the main loop
676 /// and its epilogue in two separate passes. This setup helps us avoid
677 /// regenerating and recomputing runtime safety checks. It also helps us to
678 /// shorten the iteration-count-check path length for the cases where the
679 /// iteration count of the loop is so small that the main vector loop is
680 /// completely skipped.
682
683protected:
685};
686
687/// A specialized derived class of inner loop vectorizer that performs
688/// vectorization of *main* loops in the process of vectorizing loops and their
689/// epilogues.
691public:
702
703protected:
704 void printDebugTracesAtStart() override;
705 void printDebugTracesAtEnd() override;
706};
707
708// A specialized derived class of inner loop vectorizer that performs
709// vectorization of *epilogue* loops in the process of vectorizing loops and
710// their epilogues.
712public:
719 GeneratedRTChecks &Checks, VPlan &Plan)
721 Checks, Plan, EPI.EpilogueVF,
722 EPI.EpilogueVF, EPI.EpilogueUF) {}
723 /// Implements the interface for creating a vectorized skeleton using the
724 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
726
727protected:
728 void printDebugTracesAtStart() override;
729 void printDebugTracesAtEnd() override;
730};
731} // end namespace llvm
732
733/// Look for a meaningful debug location on the instruction or its operands.
735 if (!I)
736 return DebugLoc::getUnknown();
737
739 if (I->getDebugLoc() != Empty)
740 return I->getDebugLoc();
741
742 for (Use &Op : I->operands()) {
743 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
744 if (OpInst->getDebugLoc() != Empty)
745 return OpInst->getDebugLoc();
746 }
747
748 return I->getDebugLoc();
749}
750
751/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
752/// is passed, the message relates to that particular instruction.
753#ifndef NDEBUG
754static void debugVectorizationMessage(const StringRef Prefix,
755 const StringRef DebugMsg,
756 Instruction *I) {
757 dbgs() << "LV: " << Prefix << DebugMsg;
758 if (I != nullptr)
759 dbgs() << " " << *I;
760 else
761 dbgs() << '.';
762 dbgs() << '\n';
763}
764#endif
765
766/// Create an analysis remark that explains why vectorization failed
767///
768/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
769/// RemarkName is the identifier for the remark. If \p I is passed it is an
770/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
771/// the location of the remark. If \p DL is passed, use it as debug location for
772/// the remark. \return the remark object that can be streamed to.
773static OptimizationRemarkAnalysis
774createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
775 Instruction *I, DebugLoc DL = {}) {
776 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
777 // If debug location is attached to the instruction, use it. Otherwise if DL
778 // was not provided, use the loop's.
779 if (I && I->getDebugLoc())
780 DL = I->getDebugLoc();
781 else if (!DL)
782 DL = TheLoop->getStartLoc();
783
784 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
785}
786
787namespace llvm {
788
789/// Return the runtime value for VF.
791 return B.CreateElementCount(Ty, VF);
792}
793
795 const StringRef OREMsg, const StringRef ORETag,
796 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
797 Instruction *I) {
798 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
799 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
800 ORE->emit(
801 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
802 << "loop not vectorized: " << OREMsg);
803}
804
805/// Reports an informative message: print \p Msg for debugging purposes as well
806/// as an optimization remark. Uses either \p I as location of the remark, or
807/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
808/// remark. If \p DL is passed, use it as debug location for the remark.
809static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
811 Loop *TheLoop, Instruction *I = nullptr,
812 DebugLoc DL = {}) {
814 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
815 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
816 I, DL)
817 << Msg);
818}
819
820/// Report successful vectorization of the loop. In case an outer loop is
821/// vectorized, prepend "outer" to the vectorization remark.
823 VectorizationFactor VF, unsigned IC) {
825 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
826 nullptr));
827 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
828 ORE->emit([&]() {
829 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
830 TheLoop->getHeader())
831 << "vectorized " << LoopType << "loop (vectorization width: "
832 << ore::NV("VectorizationFactor", VF.Width)
833 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
834 });
835}
836
837} // end namespace llvm
838
839namespace llvm {
840
841// Loop vectorization cost-model hints how the scalar epilogue loop should be
842// lowered.
844
845 // The default: allowing scalar epilogues.
847
848 // Vectorization with OptForSize: don't allow epilogues.
850
851 // A special case of vectorisation with OptForSize: loops with a very small
852 // trip count are considered for vectorization under OptForSize, thereby
853 // making sure the cost of their loop body is dominant, free of runtime
854 // guards and scalar iteration overheads.
856
857 // Loop hint predicate indicating an epilogue is undesired.
859
860 // Directive indicating we must either tail fold or not vectorize
862};
863
864/// LoopVectorizationCostModel - estimates the expected speedups due to
865/// vectorization.
866/// In many cases vectorization is not profitable. This can happen because of
867/// a number of reasons. In this class we mainly attempt to predict the
868/// expected speedup/slowdowns due to the supported instruction set. We use the
869/// TargetTransformInfo to query the different backends for the cost of
870/// different operations.
873
874public:
882 std::function<BlockFrequencyInfo &()> GetBFI,
883 const Function *F, const LoopVectorizeHints *Hints,
885 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
886 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
889 if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
890 initializeVScaleForTuning();
892 }
893
894 /// \return An upper bound for the vectorization factors (both fixed and
895 /// scalable). If the factors are 0, vectorization and interleaving should be
896 /// avoided up front.
897 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
898
899 /// \return True if runtime checks are required for vectorization, and false
900 /// otherwise.
901 bool runtimeChecksRequired();
902
903 /// Setup cost-based decisions for user vectorization factor.
904 /// \return true if the UserVF is a feasible VF to be chosen.
907 return expectedCost(UserVF).isValid();
908 }
909
910 /// \return True if maximizing vector bandwidth is enabled by the target or
911 /// user options, for the given register kind.
912 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
913
914 /// \return True if register pressure should be considered for the given VF.
915 bool shouldConsiderRegPressureForVF(ElementCount VF);
916
917 /// \return The size (in bits) of the smallest and widest types in the code
918 /// that needs to be vectorized. We ignore values that remain scalar such as
919 /// 64 bit loop indices.
920 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
921
922 /// Memory access instruction may be vectorized in more than one way.
923 /// Form of instruction after vectorization depends on cost.
924 /// This function takes cost-based decisions for Load/Store instructions
925 /// and collects them in a map. This decisions map is used for building
926 /// the lists of loop-uniform and loop-scalar instructions.
927 /// The calculated cost is saved with widening decision in order to
928 /// avoid redundant calculations.
929 void setCostBasedWideningDecision(ElementCount VF);
930
931 /// A call may be vectorized in different ways depending on whether we have
932 /// vectorized variants available and whether the target supports masking.
933 /// This function analyzes all calls in the function at the supplied VF,
934 /// makes a decision based on the costs of available options, and stores that
935 /// decision in a map for use in planning and plan execution.
936 void setVectorizedCallDecision(ElementCount VF);
937
938 /// Collect values we want to ignore in the cost model.
939 void collectValuesToIgnore();
940
941 /// Collect all element types in the loop for which widening is needed.
942 void collectElementTypesForWidening();
943
944 /// Split reductions into those that happen in the loop, and those that happen
945 /// outside. In loop reductions are collected into InLoopReductions.
946 void collectInLoopReductions();
947
948 /// Returns true if we should use strict in-order reductions for the given
949 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
950 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
951 /// of FP operations.
952 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
953 return !Hints->allowReordering() && RdxDesc.isOrdered();
954 }
955
956 /// \returns The smallest bitwidth each instruction can be represented with.
957 /// The vector equivalents of these instructions should be truncated to this
958 /// type.
960 return MinBWs;
961 }
962
963 /// \returns True if it is more profitable to scalarize instruction \p I for
964 /// vectorization factor \p VF.
966 assert(VF.isVector() &&
967 "Profitable to scalarize relevant only for VF > 1.");
968 assert(
969 TheLoop->isInnermost() &&
970 "cost-model should not be used for outer loops (in VPlan-native path)");
971
972 auto Scalars = InstsToScalarize.find(VF);
973 assert(Scalars != InstsToScalarize.end() &&
974 "VF not yet analyzed for scalarization profitability");
975 return Scalars->second.contains(I);
976 }
977
978 /// Returns true if \p I is known to be uniform after vectorization.
980 assert(
981 TheLoop->isInnermost() &&
982 "cost-model should not be used for outer loops (in VPlan-native path)");
983 // Pseudo probe needs to be duplicated for each unrolled iteration and
984 // vector lane so that profiled loop trip count can be accurately
985 // accumulated instead of being under counted.
987 return false;
988
989 if (VF.isScalar())
990 return true;
991
992 auto UniformsPerVF = Uniforms.find(VF);
993 assert(UniformsPerVF != Uniforms.end() &&
994 "VF not yet analyzed for uniformity");
995 return UniformsPerVF->second.count(I);
996 }
997
998 /// Returns true if \p I is known to be scalar after vectorization.
1000 assert(
1001 TheLoop->isInnermost() &&
1002 "cost-model should not be used for outer loops (in VPlan-native path)");
1003 if (VF.isScalar())
1004 return true;
1005
1006 auto ScalarsPerVF = Scalars.find(VF);
1007 assert(ScalarsPerVF != Scalars.end() &&
1008 "Scalar values are not calculated for VF");
1009 return ScalarsPerVF->second.count(I);
1010 }
1011
1012 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1013 /// for vectorization factor \p VF.
1015 // Truncs must truncate at most to their destination type.
1016 if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) &&
1017 I->getType()->getScalarSizeInBits() < MinBWs.lookup(I))
1018 return false;
1019 return VF.isVector() && MinBWs.contains(I) &&
1020 !isProfitableToScalarize(I, VF) &&
1022 }
1023
1024 /// Decision that was taken during cost calculation for memory instruction.
1027 CM_Widen, // For consecutive accesses with stride +1.
1028 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1034 };
1035
1036 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1037 /// instruction \p I and vector width \p VF.
1040 assert(VF.isVector() && "Expected VF >=2");
1041 WideningDecisions[{I, VF}] = {W, Cost};
1042 }
1043
1044 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1045 /// interleaving group \p Grp and vector width \p VF.
1049 assert(VF.isVector() && "Expected VF >=2");
/// Broadcast this decision to all instructions inside the group.
1051 /// When interleaving, the cost will only be assigned one instruction, the
1052 /// insert position. For other cases, add the appropriate fraction of the
1053 /// total cost to each instruction. This ensures accurate costs are used,
1054 /// even if the insert position instruction is not used.
1055 InstructionCost InsertPosCost = Cost;
1056 InstructionCost OtherMemberCost = 0;
1057 if (W != CM_Interleave)
1058 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1059 ;
1060 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1061 if (auto *I = Grp->getMember(Idx)) {
1062 if (Grp->getInsertPos() == I)
1063 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1064 else
1065 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1066 }
1067 }
1068 }
1069
1070 /// Return the cost model decision for the given instruction \p I and vector
1071 /// width \p VF. Return CM_Unknown if this instruction did not pass
1072 /// through the cost modeling.
1074 assert(VF.isVector() && "Expected VF to be a vector VF");
1075 assert(
1076 TheLoop->isInnermost() &&
1077 "cost-model should not be used for outer loops (in VPlan-native path)");
1078
1079 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1080 auto Itr = WideningDecisions.find(InstOnVF);
1081 if (Itr == WideningDecisions.end())
1082 return CM_Unknown;
1083 return Itr->second.first;
1084 }
1085
1086 /// Return the vectorization cost for the given instruction \p I and vector
1087 /// width \p VF.
1089 assert(VF.isVector() && "Expected VF >=2");
1090 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1091 assert(WideningDecisions.contains(InstOnVF) &&
1092 "The cost is not calculated");
1093 return WideningDecisions[InstOnVF].second;
1094 }
1095
1103
1105 Function *Variant, Intrinsic::ID IID,
1106 std::optional<unsigned> MaskPos,
1108 assert(!VF.isScalar() && "Expected vector VF");
1109 CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
1110 }
1111
1113 ElementCount VF) const {
1114 assert(!VF.isScalar() && "Expected vector VF");
1115 auto I = CallWideningDecisions.find({CI, VF});
1116 if (I == CallWideningDecisions.end())
1117 return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
1118 return I->second;
1119 }
1120
1121 /// Return True if instruction \p I is an optimizable truncate whose operand
1122 /// is an induction variable. Such a truncate will be removed by adding a new
1123 /// induction variable with the destination type.
1125 // If the instruction is not a truncate, return false.
1126 auto *Trunc = dyn_cast<TruncInst>(I);
1127 if (!Trunc)
1128 return false;
1129
1130 // Get the source and destination types of the truncate.
1131 Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1132 Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1133
1134 // If the truncate is free for the given types, return false. Replacing a
1135 // free truncate with an induction variable would add an induction variable
1136 // update instruction to each iteration of the loop. We exclude from this
1137 // check the primary induction variable since it will need an update
1138 // instruction regardless.
1139 Value *Op = Trunc->getOperand(0);
1140 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1141 return false;
1142
1143 // If the truncated value is not an induction variable, return false.
1144 return Legal->isInductionPhi(Op);
1145 }
1146
1147 /// Collects the instructions to scalarize for each predicated instruction in
1148 /// the loop.
1149 void collectInstsToScalarize(ElementCount VF);
1150
1151 /// Collect values that will not be widened, including Uniforms, Scalars, and
1152 /// Instructions to Scalarize for the given \p VF.
1153 /// The sets depend on CM decision for Load/Store instructions
1154 /// that may be vectorized as interleave, gather-scatter or scalarized.
1155 /// Also make a decision on what to do about call instructions in the loop
1156 /// at that VF -- scalarize, call a known vector routine, or call a
1157 /// vector intrinsic.
1159 // Do the analysis once.
1160 if (VF.isScalar() || Uniforms.contains(VF))
1161 return;
1163 collectLoopUniforms(VF);
1165 collectLoopScalars(VF);
1167 }
1168
1169 /// Returns true if the target machine supports masked store operation
1170 /// for the given \p DataType and kind of access to \p Ptr.
1171 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1172 unsigned AddressSpace) const {
1173 return Legal->isConsecutivePtr(DataType, Ptr) &&
1175 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace));
1176 }
1177
1178 /// Returns true if the target machine supports masked load operation
1179 /// for the given \p DataType and kind of access to \p Ptr.
1180 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1181 unsigned AddressSpace) const {
1182 return Legal->isConsecutivePtr(DataType, Ptr) &&
1184 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace));
1185 }
1186
1187 /// Returns true if the target machine can represent \p V as a masked gather
1188 /// or scatter operation.
1190 bool LI = isa<LoadInst>(V);
1191 bool SI = isa<StoreInst>(V);
1192 if (!LI && !SI)
1193 return false;
1194 auto *Ty = getLoadStoreType(V);
1196 if (VF.isVector())
1197 Ty = VectorType::get(Ty, VF);
1198 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1199 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1200 }
1201
1202 /// Returns true if the target machine supports all of the reduction
1203 /// variables found for the given VF.
1205 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1206 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1207 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1208 }));
1209 }
1210
1211 /// Given costs for both strategies, return true if the scalar predication
1212 /// lowering should be used for div/rem. This incorporates an override
1213 /// option so it is not simply a cost comparison.
1215 InstructionCost SafeDivisorCost) const {
1216 switch (ForceSafeDivisor) {
1217 case cl::BOU_UNSET:
1218 return ScalarCost < SafeDivisorCost;
1219 case cl::BOU_TRUE:
1220 return false;
1221 case cl::BOU_FALSE:
1222 return true;
1223 }
1224 llvm_unreachable("impossible case value");
1225 }
1226
1227 /// Returns true if \p I is an instruction which requires predication and
1228 /// for which our chosen predication strategy is scalarization (i.e. we
1229 /// don't have an alternate strategy such as masking available).
1230 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1231 bool isScalarWithPredication(Instruction *I, ElementCount VF);
1232
1233 /// Wrapper function for LoopVectorizationLegality::isMaskRequired,
1234 /// that passes the Instruction \p I and if we fold tail.
1235 bool isMaskRequired(Instruction *I) const;
1236
1237 /// Returns true if \p I is an instruction that needs to be predicated
1238 /// at runtime. The result is independent of the predication mechanism.
1239 /// Superset of instructions that return true for isScalarWithPredication.
1240 bool isPredicatedInst(Instruction *I) const;
1241
1242 /// A helper function that returns how much we should divide the cost of a
1243 /// predicated block by. Typically this is the reciprocal of the block
1244 /// probability, i.e. if we return X we are assuming the predicated block will
1245 /// execute once for every X iterations of the loop header so the block should
1246 /// only contribute 1/X of its cost to the total cost calculation, but when
1247 /// optimizing for code size it will just be 1 as code size costs don't depend
1248 /// on execution probabilities.
1249 ///
1250 /// Note that if a block wasn't originally predicated but was predicated due
1251 /// to tail folding, the divisor will still be 1 because it will execute for
1252 /// every iteration of the loop header.
1253 inline uint64_t
1254 getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1255 const BasicBlock *BB);
1256
1257 /// Returns true if an artificially high cost for emulated masked memrefs
1258 /// should be used.
1259 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1260
1261 /// Return the costs for our two available strategies for lowering a
1262 /// div/rem operation which requires speculating at least one lane.
1263 /// First result is for scalarization (will be invalid for scalable
1264 /// vectors); second is for the safe-divisor strategy.
1265 std::pair<InstructionCost, InstructionCost>
1266 getDivRemSpeculationCost(Instruction *I, ElementCount VF);
1267
1268 /// Returns true if \p I is a memory instruction with consecutive memory
1269 /// access that can be widened.
1270 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1271
1272 /// Returns true if \p I is a memory instruction in an interleaved-group
1273 /// of memory accesses that can be vectorized with wide vector loads/stores
1274 /// and shuffles.
1275 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1276
1277 /// Check if \p Instr belongs to any interleaved access group.
1279 return InterleaveInfo.isInterleaved(Instr);
1280 }
1281
1282 /// Get the interleaved access group that \p Instr belongs to.
1285 return InterleaveInfo.getInterleaveGroup(Instr);
1286 }
1287
1288 /// Returns true if we're required to use a scalar epilogue for at least
1289 /// the final iteration of the original loop.
1290 bool requiresScalarEpilogue(bool IsVectorizing) const {
1291 if (!isScalarEpilogueAllowed()) {
1292 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1293 return false;
1294 }
1295 // If we might exit from anywhere but the latch and early exit vectorization
1296 // is disabled, we must run the exiting iteration in scalar form.
1297 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1298 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1299 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1300 "from latch block\n");
1301 return true;
1302 }
1303 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1304 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1305 "interleaved group requires scalar epilogue\n");
1306 return true;
1307 }
1308 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1309 return false;
1310 }
1311
1312 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1313 /// loop hint annotation.
1315 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1316 }
1317
1318 /// Returns true if tail-folding is preferred over a scalar epilogue.
1320 return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
1321 ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
1322 }
1323
1324 /// Returns the TailFoldingStyle that is best for the current loop.
1326 return ChosenTailFoldingStyle;
1327 }
1328
1329 /// Selects and saves TailFoldingStyle.
1330 /// \param IsScalableVF true if scalable vector factors enabled.
1331 /// \param UserIC User specific interleave count.
1332 void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC) {
1333 assert(ChosenTailFoldingStyle == TailFoldingStyle::None &&
1334 "Tail folding must not be selected yet.");
1335 if (!Legal->canFoldTailByMasking()) {
1336 ChosenTailFoldingStyle = TailFoldingStyle::None;
1337 return;
1338 }
1339
1340 // Default to TTI preference, but allow command line override.
1341 ChosenTailFoldingStyle = TTI.getPreferredTailFoldingStyle();
1342 if (ForceTailFoldingStyle.getNumOccurrences())
1343 ChosenTailFoldingStyle = ForceTailFoldingStyle.getValue();
1344
1345 if (ChosenTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1346 return;
1347 // Override EVL styles if needed.
1348 // FIXME: Investigate opportunity for fixed vector factor.
1349 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1350 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1351 if (EVLIsLegal)
1352 return;
1353 // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
1354 // if it's allowed, or DataWithoutLaneMask otherwise.
1355 if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
1356 ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
1357 ChosenTailFoldingStyle = TailFoldingStyle::None;
1358 else
1359 ChosenTailFoldingStyle = TailFoldingStyle::DataWithoutLaneMask;
1360
1361 LLVM_DEBUG(
1362 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1363 "not try to generate VP Intrinsics "
1364 << (UserIC > 1
1365 ? "since interleave count specified is greater than 1.\n"
1366 : "due to non-interleaving reasons.\n"));
1367 }
1368
1369 /// Returns true if all loop blocks should be masked to fold tail loop.
1370 bool foldTailByMasking() const {
1372 }
1373
1374 /// Returns true if the use of wide lane masks is requested and the loop is
1375 /// using tail-folding with a lane mask for control flow.
1378 return false;
1379
1381 }
1382
/// Return maximum safe number of elements to be processed per vector
/// iteration, which do not prevent store-load forwarding and are safe with
/// regard to the memory dependencies. Required for EVL-based VPlans to
/// correctly calculate AVL (application vector length) as min(remaining AVL,
/// MaxSafeElements).
/// Returns std::nullopt when no such limit applies.
/// TODO: need to consider adjusting cost model to use this value as a
/// vectorization factor for EVL-based vectorization.
std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1391
1392 /// Returns true if the instructions in this block requires predication
1393 /// for any reason, e.g. because tail folding now requires a predicate
1394 /// or because the block in the original loop was predicated.
1396 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1397 }
1398
1399 /// Returns true if VP intrinsics with explicit vector length support should
1400 /// be generated in the tail folded loop.
1404
1405 /// Returns true if the Phi is part of an inloop reduction.
1406 bool isInLoopReduction(PHINode *Phi) const {
1407 return InLoopReductions.contains(Phi);
1408 }
1409
1410 /// Returns the set of in-loop reduction PHIs.
1412 return InLoopReductions;
1413 }
1414
1415 /// Returns true if the predicated reduction select should be used to set the
1416 /// incoming value for the reduction phi.
1417 bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const {
1418 // Force to use predicated reduction select since the EVL of the
1419 // second-to-last iteration might not be VF*UF.
1420 if (foldTailWithEVL())
1421 return true;
1422
1423 // Note: For FindLast recurrences we prefer a predicated select to simplify
1424 // matching in handleFindLastReductions(), rather than handle multiple
1425 // cases.
1427 return true;
1428
1430 TTI.preferPredicatedReductionSelect();
1431 }
1432
1433 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1434 /// with factor VF. Return the cost of the instruction, including
1435 /// scalarization overhead if it's needed.
1436 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1437
1438 /// Estimate cost of a call instruction CI if it were vectorized with factor
1439 /// VF. Return the cost of the instruction, including scalarization overhead
1440 /// if it's needed.
1441 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1442
1443 /// Invalidates decisions already taken by the cost model.
1445 WideningDecisions.clear();
1446 CallWideningDecisions.clear();
1447 Uniforms.clear();
1448 Scalars.clear();
1449 }
1450
1451 /// Returns the expected execution cost. The unit of the cost does
1452 /// not matter because we use the 'cost' units to compare different
1453 /// vector widths. The cost that is returned is *not* normalized by
1454 /// the factor width.
1455 InstructionCost expectedCost(ElementCount VF);
1456
1457 bool hasPredStores() const { return NumPredStores > 0; }
1458
1459 /// Returns true if epilogue vectorization is considered profitable, and
1460 /// false otherwise.
1461 /// \p VF is the vectorization factor chosen for the original loop.
/// \p IC is an additional scaling factor applied to VF before
/// comparing to EpilogueVectorizationMinVF.
1464 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1465 const unsigned IC) const;
1466
1467 /// Returns the execution time cost of an instruction for a given vector
1468 /// width. Vector width of one means scalar.
1469 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1470
1471 /// Return the cost of instructions in an inloop reduction pattern, if I is
1472 /// part of that pattern.
1473 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1474 ElementCount VF,
1475 Type *VectorTy) const;
1476
1477 /// Returns true if \p Op should be considered invariant and if it is
1478 /// trivially hoistable.
1479 bool shouldConsiderInvariant(Value *Op);
1480
1481 /// Return the value of vscale used for tuning the cost model.
1482 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1483
1484private:
1485 unsigned NumPredStores = 0;
1486
1487 /// Used to store the value of vscale used for tuning the cost model. It is
1488 /// initialized during object construction.
1489 std::optional<unsigned> VScaleForTuning;
1490
1491 /// Initializes the value of vscale used for tuning the cost model. If
1492 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1493 /// return the value returned by the corresponding TTI method.
1494 void initializeVScaleForTuning() {
1495 const Function *Fn = TheLoop->getHeader()->getParent();
1496 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1497 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1498 auto Min = Attr.getVScaleRangeMin();
1499 auto Max = Attr.getVScaleRangeMax();
1500 if (Max && Min == Max) {
1501 VScaleForTuning = Max;
1502 return;
1503 }
1504 }
1505
1506 VScaleForTuning = TTI.getVScaleForTuning();
1507 }
1508
1509 /// \return An upper bound for the vectorization factors for both
1510 /// fixed and scalable vectorization, where the minimum-known number of
1511 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1512 /// disabled or unsupported, then the scalable part will be equal to
1513 /// ElementCount::getScalable(0).
1514 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1515 ElementCount UserVF, unsigned UserIC,
1516 bool FoldTailByMasking);
1517
1518 /// If \p VF * \p UserIC > MaxTripcount, clamps VF to the next lower VF that
1519 /// results in VF * UserIC <= MaxTripCount.
1520 ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
1521 unsigned UserIC,
1522 bool FoldTailByMasking) const;
1523
1524 /// \return the maximized element count based on the targets vector
1525 /// registers and the loop trip-count, but limited to a maximum safe VF.
1526 /// This is a helper function of computeFeasibleMaxVF.
1527 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1528 unsigned SmallestType,
1529 unsigned WidestType,
1530 ElementCount MaxSafeVF, unsigned UserIC,
1531 bool FoldTailByMasking);
1532
1533 /// Checks if scalable vectorization is supported and enabled. Caches the
1534 /// result to avoid repeated debug dumps for repeated queries.
1535 bool isScalableVectorizationAllowed();
1536
1537 /// \return the maximum legal scalable VF, based on the safe max number
1538 /// of elements.
1539 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1540
1541 /// Calculate vectorization cost of memory instruction \p I.
1542 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1543
1544 /// The cost computation for scalarized memory instruction.
1545 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1546
1547 /// The cost computation for interleaving group of memory instructions.
1548 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1549
1550 /// The cost computation for Gather/Scatter instruction.
1551 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1552
1553 /// The cost computation for widening instruction \p I with consecutive
1554 /// memory access.
1555 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1556
1557 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1558 /// Load: scalar load + broadcast.
1559 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1560 /// element)
1561 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1562
1563 /// Estimate the overhead of scalarizing an instruction. This is a
1564 /// convenience wrapper for the type-based getScalarizationOverhead API.
1566 ElementCount VF) const;
1567
1568 /// Map of scalar integer values to the smallest bitwidth they can be legally
1569 /// represented as. The vector equivalents of these values should be truncated
1570 /// to this type.
1571 MapVector<Instruction *, uint64_t> MinBWs;
1572
1573 /// A type representing the costs for instructions if they were to be
1574 /// scalarized rather than vectorized. The entries are Instruction-Cost
1575 /// pairs.
1576 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1577
/// A set containing all BasicBlocks that are known to be present after
/// vectorization as a predicated block.
1580 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1581 PredicatedBBsAfterVectorization;
1582
1583 /// Records whether it is allowed to have the original scalar loop execute at
1584 /// least once. This may be needed as a fallback loop in case runtime
1585 /// aliasing/dependence checks fail, or to handle the tail/remainder
1586 /// iterations when the trip count is unknown or doesn't divide by the VF,
1587 /// or as a peel-loop to handle gaps in interleave-groups.
1588 /// Under optsize and when the trip count is very small we don't allow any
1589 /// iterations to execute in the scalar loop.
1590 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1591
1592 /// Control finally chosen tail folding style.
1593 TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;
1594
1595 /// true if scalable vectorization is supported and enabled.
1596 std::optional<bool> IsScalableVectorizationAllowed;
1597
1598 /// Maximum safe number of elements to be processed per vector iteration,
1599 /// which do not prevent store-load forwarding and are safe with regard to the
/// memory dependencies. Required for EVL-based vectorization, where this
1601 /// value is used as the upper bound of the safe AVL.
1602 std::optional<unsigned> MaxSafeElements;
1603
1604 /// A map holding scalar costs for different vectorization factors. The
1605 /// presence of a cost for an instruction in the mapping indicates that the
1606 /// instruction will be scalarized when vectorizing with the associated
1607 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1608 MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
1609
1610 /// Holds the instructions known to be uniform after vectorization.
1611 /// The data is collected per VF.
1612 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1613
1614 /// Holds the instructions known to be scalar after vectorization.
1615 /// The data is collected per VF.
1616 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1617
1618 /// Holds the instructions (address computations) that are forced to be
1619 /// scalarized.
1620 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1621
1622 /// PHINodes of the reductions that should be expanded in-loop.
1623 SmallPtrSet<PHINode *, 4> InLoopReductions;
1624
1625 /// A Map of inloop reduction operations and their immediate chain operand.
1626 /// FIXME: This can be removed once reductions can be costed correctly in
1627 /// VPlan. This was added to allow quick lookup of the inloop operations.
1628 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1629
1630 /// Returns the expected difference in cost from scalarizing the expression
1631 /// feeding a predicated instruction \p PredInst. The instructions to
1632 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1633 /// non-negative return value implies the expression will be scalarized.
1634 /// Currently, only single-use chains are considered for scalarization.
1635 InstructionCost computePredInstDiscount(Instruction *PredInst,
1636 ScalarCostsTy &ScalarCosts,
1637 ElementCount VF);
1638
1639 /// Collect the instructions that are uniform after vectorization. An
1640 /// instruction is uniform if we represent it with a single scalar value in
1641 /// the vectorized loop corresponding to each vector iteration. Examples of
1642 /// uniform instructions include pointer operands of consecutive or
1643 /// interleaved memory accesses. Note that although uniformity implies an
1644 /// instruction will be scalar, the reverse is not true. In general, a
1645 /// scalarized instruction will be represented by VF scalar values in the
1646 /// vectorized loop, each corresponding to an iteration of the original
1647 /// scalar loop.
1648 void collectLoopUniforms(ElementCount VF);
1649
1650 /// Collect the instructions that are scalar after vectorization. An
1651 /// instruction is scalar if it is known to be uniform or will be scalarized
1652 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1653 /// to the list if they are used by a load/store instruction that is marked as
1654 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1655 /// VF values in the vectorized loop, each corresponding to an iteration of
1656 /// the original scalar loop.
1657 void collectLoopScalars(ElementCount VF);
1658
1659 /// Keeps cost model vectorization decision and cost for instructions.
1660 /// Right now it is used for memory instructions only.
1661 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1662 std::pair<InstWidening, InstructionCost>>;
1663
1664 DecisionList WideningDecisions;
1665
1666 using CallDecisionList =
1667 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1668
1669 CallDecisionList CallWideningDecisions;
1670
1671 /// Returns true if \p V is expected to be vectorized and it needs to be
1672 /// extracted.
1673 bool needsExtract(Value *V, ElementCount VF) const {
1675 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1676 TheLoop->isLoopInvariant(I) ||
1677 getWideningDecision(I, VF) == CM_Scalarize ||
1678 (isa<CallInst>(I) &&
1679 getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
1680 return false;
1681
1682 // Assume we can vectorize V (and hence we need extraction) if the
1683 // scalars are not computed yet. This can happen, because it is called
1684 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1685 // the scalars are collected. That should be a safe assumption in most
1686 // cases, because we check if the operands have vectorizable types
1687 // beforehand in LoopVectorizationLegality.
1688 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1689 };
1690
1691 /// Returns a range containing only operands needing to be extracted.
1692 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1693 ElementCount VF) const {
1694
1695 SmallPtrSet<const Value *, 4> UniqueOperands;
1696 SmallVector<Value *, 4> Res;
1697 for (Value *Op : Ops) {
1698 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
1699 !needsExtract(Op, VF))
1700 continue;
1701 Res.push_back(Op);
1702 }
1703 return Res;
1704 }
1705
1706public:
1707 /// The loop that we evaluate.
1709
1710 /// Predicated scalar evolution analysis.
1712
1713 /// Loop Info analysis.
1715
1716 /// Vectorization legality.
1718
1719 /// Vector target information.
1721
1722 /// Target Library Info.
1724
1725 /// Demanded bits analysis.
1727
1728 /// Assumption cache.
1730
1731 /// Interface to emit optimization remarks.
1733
1734 /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
1735 /// unless necessary, e.g. when the loop isn't legal to vectorize or when
1736 /// there is no predication.
1737 std::function<BlockFrequencyInfo &()> GetBFI;
1738 /// The BlockFrequencyInfo returned from GetBFI.
1740 /// Returns the BlockFrequencyInfo for the function if cached, otherwise
1741 /// fetches it via GetBFI. Avoids an indirect call to the std::function.
1743 if (!BFI)
1744 BFI = &GetBFI();
1745 return *BFI;
1746 }
1747
1749
1750 /// Loop Vectorize Hint.
1752
1753 /// The interleave access information contains groups of interleaved accesses
1754 /// with the same stride and close to each other.
1756
1757 /// Values to ignore in the cost model.
1759
1760 /// Values to ignore in the cost model when VF > 1.
1762
1763 /// All element types found in the loop.
1765
1766 /// The kind of cost that we are calculating
1768
1769 /// Whether this loop should be optimized for size based on function attribute
1770 /// or profile information.
1772
1773 /// The highest VF possible for this loop, without using MaxBandwidth.
1775};
1776} // end namespace llvm
1777
1778namespace {
1779/// Helper struct to manage generating runtime checks for vectorization.
1780///
1781/// The runtime checks are created up-front in temporary blocks to allow better
1782/// estimating the cost and un-linked from the existing IR. After deciding to
1783/// vectorize, the checks are moved back. If deciding not to vectorize, the
1784/// temporary blocks are completely removed.
1785class GeneratedRTChecks {
1786 /// Basic block which contains the generated SCEV checks, if any.
1787 BasicBlock *SCEVCheckBlock = nullptr;
1788
1789 /// The value representing the result of the generated SCEV checks. If it is
1790 /// nullptr no SCEV checks have been generated.
1791 Value *SCEVCheckCond = nullptr;
1792
1793 /// Basic block which contains the generated memory runtime checks, if any.
1794 BasicBlock *MemCheckBlock = nullptr;
1795
1796 /// The value representing the result of the generated memory runtime checks.
1797 /// If it is nullptr no memory runtime checks have been generated.
1798 Value *MemRuntimeCheckCond = nullptr;
1799
1800 DominatorTree *DT;
1801 LoopInfo *LI;
1803
1804 SCEVExpander SCEVExp;
1805 SCEVExpander MemCheckExp;
1806
1807 bool CostTooHigh = false;
1808
1809 Loop *OuterLoop = nullptr;
1810
1812
1813 /// The kind of cost that we are calculating
1815
1816public:
1817 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1820 : DT(DT), LI(LI), TTI(TTI),
1821 SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1822 MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1823 PSE(PSE), CostKind(CostKind) {}
1824
1825 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1826 /// accurately estimate the cost of the runtime checks. The blocks are
1827 /// un-linked from the IR and are added back during vector code generation. If
1828 /// there is no vector code generation, the check blocks are removed
1829 /// completely.
1830 void create(Loop *L, const LoopAccessInfo &LAI,
1831 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
1832 OptimizationRemarkEmitter &ORE) {
1833
1834 // Hard cutoff to limit compile-time increase in case a very large number of
1835 // runtime checks needs to be generated.
1836 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1837 // profile info.
1838 CostTooHigh =
1840 if (CostTooHigh) {
1841 // Mark runtime checks as never succeeding when they exceed the threshold.
1842 MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
1843 SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
1844 ORE.emit([&]() {
1845 return OptimizationRemarkAnalysisAliasing(
1846 DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
1847 L->getHeader())
1848 << "loop not vectorized: too many memory checks needed";
1849 });
1850 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
1851 return;
1852 }
1853
1854 BasicBlock *LoopHeader = L->getHeader();
1855 BasicBlock *Preheader = L->getLoopPreheader();
1856
1857 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1858 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1859 // may be used by SCEVExpander. The blocks will be un-linked from their
1860 // predecessors and removed from LI & DT at the end of the function.
1861 if (!UnionPred.isAlwaysTrue()) {
1862 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1863 nullptr, "vector.scevcheck");
1864
1865 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1866 &UnionPred, SCEVCheckBlock->getTerminator());
1867 if (isa<Constant>(SCEVCheckCond)) {
1868 // Clean up directly after expanding the predicate to a constant, to
1869 // avoid further expansions re-using anything left over from SCEVExp.
1870 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1871 SCEVCleaner.cleanup();
1872 }
1873 }
1874
1875 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1876 if (RtPtrChecking.Need) {
1877 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1878 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1879 "vector.memcheck");
1880
1881 auto DiffChecks = RtPtrChecking.getDiffChecks();
1882 if (DiffChecks) {
1883 Value *RuntimeVF = nullptr;
1884 MemRuntimeCheckCond = addDiffRuntimeChecks(
1885 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1886 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1887 if (!RuntimeVF)
1888 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1889 return RuntimeVF;
1890 },
1891 IC);
1892 } else {
1893 MemRuntimeCheckCond = addRuntimeChecks(
1894 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1896 }
1897 assert(MemRuntimeCheckCond &&
1898 "no RT checks generated although RtPtrChecking "
1899 "claimed checks are required");
1900 }
1901
1902 SCEVExp.eraseDeadInstructions(SCEVCheckCond);
1903
1904 if (!MemCheckBlock && !SCEVCheckBlock)
1905 return;
1906
1907 // Unhook the temporary block with the checks, update various places
1908 // accordingly.
1909 if (SCEVCheckBlock)
1910 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1911 if (MemCheckBlock)
1912 MemCheckBlock->replaceAllUsesWith(Preheader);
1913
1914 if (SCEVCheckBlock) {
1915 SCEVCheckBlock->getTerminator()->moveBefore(
1916 Preheader->getTerminator()->getIterator());
1917 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1918 UI->setDebugLoc(DebugLoc::getTemporary());
1919 Preheader->getTerminator()->eraseFromParent();
1920 }
1921 if (MemCheckBlock) {
1922 MemCheckBlock->getTerminator()->moveBefore(
1923 Preheader->getTerminator()->getIterator());
1924 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1925 UI->setDebugLoc(DebugLoc::getTemporary());
1926 Preheader->getTerminator()->eraseFromParent();
1927 }
1928
1929 DT->changeImmediateDominator(LoopHeader, Preheader);
1930 if (MemCheckBlock) {
1931 DT->eraseNode(MemCheckBlock);
1932 LI->removeBlock(MemCheckBlock);
1933 }
1934 if (SCEVCheckBlock) {
1935 DT->eraseNode(SCEVCheckBlock);
1936 LI->removeBlock(SCEVCheckBlock);
1937 }
1938
1939 // Outer loop is used as part of the later cost calculations.
1940 OuterLoop = L->getParentLoop();
1941 }
1942
1944 if (SCEVCheckBlock || MemCheckBlock)
1945 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1946
1947 if (CostTooHigh) {
1949 Cost.setInvalid();
1950 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1951 return Cost;
1952 }
1953
1954 InstructionCost RTCheckCost = 0;
1955 if (SCEVCheckBlock)
1956 for (Instruction &I : *SCEVCheckBlock) {
1957 if (SCEVCheckBlock->getTerminator() == &I)
1958 continue;
1960 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1961 RTCheckCost += C;
1962 }
1963 if (MemCheckBlock) {
1964 InstructionCost MemCheckCost = 0;
1965 for (Instruction &I : *MemCheckBlock) {
1966 if (MemCheckBlock->getTerminator() == &I)
1967 continue;
1969 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1970 MemCheckCost += C;
1971 }
1972
1973 // If the runtime memory checks are being created inside an outer loop
1974 // we should find out if these checks are outer loop invariant. If so,
1975 // the checks will likely be hoisted out and so the effective cost will
1976 // reduce according to the outer loop trip count.
1977 if (OuterLoop) {
1978 ScalarEvolution *SE = MemCheckExp.getSE();
1979 // TODO: If profitable, we could refine this further by analysing every
1980 // individual memory check, since there could be a mixture of loop
1981 // variant and invariant checks that mean the final condition is
1982 // variant.
1983 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1984 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1985 // It seems reasonable to assume that we can reduce the effective
1986 // cost of the checks even when we know nothing about the trip
1987 // count. Assume that the outer loop executes at least twice.
1988 unsigned BestTripCount = 2;
1989
1990 // Get the best known TC estimate.
1991 if (auto EstimatedTC = getSmallBestKnownTC(
1992 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1993 if (EstimatedTC->isFixed())
1994 BestTripCount = EstimatedTC->getFixedValue();
1995
1996 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1997
1998 // Let's ensure the cost is always at least 1.
1999 NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
2000 (InstructionCost::CostType)1);
2001
2002 if (BestTripCount > 1)
2004 << "We expect runtime memory checks to be hoisted "
2005 << "out of the outer loop. Cost reduced from "
2006 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2007
2008 MemCheckCost = NewMemCheckCost;
2009 }
2010 }
2011
2012 RTCheckCost += MemCheckCost;
2013 }
2014
2015 if (SCEVCheckBlock || MemCheckBlock)
2016 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2017 << "\n");
2018
2019 return RTCheckCost;
2020 }
2021
  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A check block counts as "used" once it has been re-linked into the CFG,
    // i.e. it has predecessors again. A null block (no checks of that kind
    // were ever created) trivially counts as used so no cleanup is attempted.
    bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
    bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
    if (SCEVChecksUsed)
      SCEVCleaner.markResultUsed();

    if (MemChecksUsed) {
      MemCheckCleaner.markResultUsed();
    } else {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      // Iterating the block in reverse erases users before their operands.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    // Let the cleaners drop any expander-created instructions whose results
    // were not marked used above, then delete the now-empty blocks.
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (!SCEVChecksUsed)
      SCEVCheckBlock->eraseFromParent();
    if (!MemChecksUsed)
      MemCheckBlock->eraseFromParent();
  }
2053
2054 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2055 /// outside VPlan.
2056 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
2057 using namespace llvm::PatternMatch;
2058 if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
2059 return {nullptr, nullptr};
2060
2061 return {SCEVCheckCond, SCEVCheckBlock};
2062 }
2063
2064 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2065 /// outside VPlan.
2066 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
2067 using namespace llvm::PatternMatch;
2068 if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
2069 return {nullptr, nullptr};
2070 return {MemRuntimeCheckCond, MemCheckBlock};
2071 }
2072
2073 /// Return true if any runtime checks have been added
2074 bool hasChecks() const {
2075 return getSCEVChecks().first || getMemRuntimeChecks().first;
2076 }
2077};
2078} // namespace
2079
2081 return Style == TailFoldingStyle::Data ||
2083}
2084
2088
2089// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2090// vectorization. The loop needs to be annotated with #pragma omp simd
2091// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2092// vector length information is not provided, vectorization is not considered
2093// explicit. Interleave hints are not allowed either. These limitations will be
2094// relaxed in the future.
2095// Please, note that we are currently forced to abuse the pragma 'clang
2096// vectorize' semantics. This pragma provides *auto-vectorization hints*
2097// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2098// provides *explicit vectorization hints* (LV can bypass legal checks and
2099// assume that vectorization is legal). However, both hints are implemented
2100// using the same metadata (llvm.loop.vectorize, processed by
2101// LoopVectorizeHints). This will be fixed in the future when the native IR
2102// representation for pragma 'omp simd' is introduced.
2103static bool isExplicitVecOuterLoop(Loop *OuterLp,
2105 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2106 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2107
2108 // Only outer loops with an explicit vectorization hint are supported.
2109 // Unannotated outer loops are ignored.
2111 return false;
2112
2113 Function *Fn = OuterLp->getHeader()->getParent();
2114 if (!Hints.allowVectorization(Fn, OuterLp,
2115 true /*VectorizeOnlyWhenForced*/)) {
2116 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2117 return false;
2118 }
2119
2120 if (Hints.getInterleave() > 1) {
2121 // TODO: Interleave support is future work.
2122 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2123 "outer loops.\n");
2124 Hints.emitRemarkWithHints();
2125 return false;
2126 }
2127
2128 return true;
2129}
2130
2134 // Collect inner loops and outer loops without irreducible control flow. For
2135 // now, only collect outer loops that have explicit vectorization hints. If we
2136 // are stress testing the VPlan H-CFG construction, we collect the outermost
2137 // loop of every loop nest.
2138 if (L.isInnermost() || VPlanBuildStressTest ||
2140 LoopBlocksRPO RPOT(&L);
2141 RPOT.perform(LI);
2143 V.push_back(&L);
2144 // TODO: Collect inner loops inside marked outer loops in case
2145 // vectorization fails for the outer loop. Do not invoke
2146 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2147 // already known to be reducible. We can use an inherited attribute for
2148 // that.
2149 return;
2150 }
2151 }
2152 for (Loop *InnerL : L)
2153 collectSupportedLoops(*InnerL, LI, ORE, V);
2154}
2155
2156//===----------------------------------------------------------------------===//
2157// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2158// LoopVectorizationCostModel and LoopVectorizationPlanner.
2159//===----------------------------------------------------------------------===//
2160
2161/// FIXME: The newly created binary instructions should contain nsw/nuw
2162/// flags, which can be found from the original scalar operations.
2163Value *
2165 Value *Step,
2167 const BinaryOperator *InductionBinOp) {
2168 using namespace llvm::PatternMatch;
2169 Type *StepTy = Step->getType();
2170 Value *CastedIndex = StepTy->isIntegerTy()
2171 ? B.CreateSExtOrTrunc(Index, StepTy)
2172 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2173 if (CastedIndex != Index) {
2174 CastedIndex->setName(CastedIndex->getName() + ".cast");
2175 Index = CastedIndex;
2176 }
2177
2178 // Note: the IR at this point is broken. We cannot use SE to create any new
2179 // SCEV and then expand it, hoping that SCEV's simplification will give us
2180 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2181 // lead to various SCEV crashes. So all we can do is to use builder and rely
2182 // on InstCombine for future simplifications. Here we handle some trivial
2183 // cases only.
2184 auto CreateAdd = [&B](Value *X, Value *Y) {
2185 assert(X->getType() == Y->getType() && "Types don't match!");
2186 if (match(X, m_ZeroInt()))
2187 return Y;
2188 if (match(Y, m_ZeroInt()))
2189 return X;
2190 return B.CreateAdd(X, Y);
2191 };
2192
2193 // We allow X to be a vector type, in which case Y will potentially be
2194 // splatted into a vector with the same element count.
2195 auto CreateMul = [&B](Value *X, Value *Y) {
2196 assert(X->getType()->getScalarType() == Y->getType() &&
2197 "Types don't match!");
2198 if (match(X, m_One()))
2199 return Y;
2200 if (match(Y, m_One()))
2201 return X;
2202 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2203 if (XVTy && !isa<VectorType>(Y->getType()))
2204 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2205 return B.CreateMul(X, Y);
2206 };
2207
2208 switch (InductionKind) {
2210 assert(!isa<VectorType>(Index->getType()) &&
2211 "Vector indices not supported for integer inductions yet");
2212 assert(Index->getType() == StartValue->getType() &&
2213 "Index type does not match StartValue type");
2214 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2215 return B.CreateSub(StartValue, Index);
2216 auto *Offset = CreateMul(Index, Step);
2217 return CreateAdd(StartValue, Offset);
2218 }
2220 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2222 assert(!isa<VectorType>(Index->getType()) &&
2223 "Vector indices not supported for FP inductions yet");
2224 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2225 assert(InductionBinOp &&
2226 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2227 InductionBinOp->getOpcode() == Instruction::FSub) &&
2228 "Original bin op should be defined for FP induction");
2229
2230 Value *MulExp = B.CreateFMul(Step, Index);
2231 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2232 "induction");
2233 }
2235 return nullptr;
2236 }
2237 llvm_unreachable("invalid enum");
2238}
2239
2240static std::optional<unsigned> getMaxVScale(const Function &F,
2241 const TargetTransformInfo &TTI) {
2242 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2243 return MaxVScale;
2244
2245 if (F.hasFnAttribute(Attribute::VScaleRange))
2246 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2247
2248 return std::nullopt;
2249}
2250
2251/// For the given VF and UF and maximum trip count computed for the loop, return
2252/// whether the induction variable might overflow in the vectorized loop. If not,
2253/// then we know a runtime overflow check always evaluates to false and can be
2254/// removed.
2256 const LoopVectorizationCostModel *Cost,
2257 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2258 // Always be conservative if we don't know the exact unroll factor.
2259 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2260
2261 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2262 APInt MaxUIntTripCount = IdxTy->getMask();
2263
2264 // We know the runtime overflow check is known false iff the (max) trip-count
2265 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2266 // the vector loop induction variable.
2267 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2268 uint64_t MaxVF = VF.getKnownMinValue();
2269 if (VF.isScalable()) {
2270 std::optional<unsigned> MaxVScale =
2271 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2272 if (!MaxVScale)
2273 return false;
2274 MaxVF *= *MaxVScale;
2275 }
2276
2277 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2278 }
2279
2280 return false;
2281}
2282
2283// Return whether we allow using masked interleave-groups (for dealing with
2284// strided loads/stores that reside in predicated blocks, or for dealing
2285// with gaps).
2287 // If an override option has been passed in for interleaved accesses, use it.
2288 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2290
2291 return TTI.enableMaskedInterleavedAccessVectorization();
2292}
2293
2294/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2295/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
2296/// predecessors and successors of VPBB, if any, are rewired to the new
2297/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
2299 BasicBlock *IRBB,
2300 VPlan *Plan = nullptr) {
2301 if (!Plan)
2302 Plan = VPBB->getPlan();
2303 VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
2304 auto IP = IRVPBB->begin();
2305 for (auto &R : make_early_inc_range(VPBB->phis()))
2306 R.moveBefore(*IRVPBB, IP);
2307
2308 for (auto &R :
2310 R.moveBefore(*IRVPBB, IRVPBB->end());
2311
2312 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2313 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2314 return IRVPBB;
2315}
2316
2318 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2319 assert(VectorPH && "Invalid loop structure");
2320 assert((OrigLoop->getUniqueLatchExitBlock() ||
2321 Cost->requiresScalarEpilogue(VF.isVector())) &&
2322 "loops not exiting via the latch without required epilogue?");
2323
2324 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2325 // wrapping the newly created scalar preheader here at the moment, because the
2326 // Plan's scalar preheader may be unreachable at this point. Instead it is
2327 // replaced in executePlan.
2328 return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
2329 Twine(Prefix) + "scalar.ph");
2330}
2331
2332/// Knowing that loop \p L executes a single vector iteration, add instructions
2333/// that will get simplified and thus should not have any cost to \p
2334/// InstsToIgnore.
2337 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2338 auto *Cmp = L->getLatchCmpInst();
2339 if (Cmp)
2340 InstsToIgnore.insert(Cmp);
2341 for (const auto &KV : IL) {
2342 // Extract the key by hand so that it can be used in the lambda below. Note
2343 // that captured structured bindings are a C++20 extension.
2344 const PHINode *IV = KV.first;
2345
2346 // Get next iteration value of the induction variable.
2347 Instruction *IVInst =
2348 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2349 if (all_of(IVInst->users(),
2350 [&](const User *U) { return U == IV || U == Cmp; }))
2351 InstsToIgnore.insert(IVInst);
2352 }
2353}
2354
2356 // Create a new IR basic block for the scalar preheader.
2357 BasicBlock *ScalarPH = createScalarPreheader("");
2358 return ScalarPH->getSinglePredecessor();
2359}
2360
2361namespace {
2362
/// DenseMap traits that hash instructions by opcode and operand identity so
/// structurally identical instructions collide, enabling a simple CSE pass
/// over a basic block (see legacyCSE below in upstream LLVM).
struct CSEDenseMapInfo {
  // Returns true for the instruction kinds this CSE is willing to combine.
  // NOTE(review): the predicate body is not visible in this excerpt —
  // confirm against upstream which kinds are handled.
  static bool canHandle(const Instruction *I) {
  }

  // Sentinel key for empty DenseMap buckets.
  // NOTE(review): body not visible in this excerpt; presumably delegates to
  // DenseMapInfo<Instruction *>::getEmptyKey() — confirm.
  static inline Instruction *getEmptyKey() {
  }

  // Sentinel key for tombstoned (erased) DenseMap buckets.
  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  // Hash on opcode plus operand pointers: identical operations on identical
  // operands hash equal, which is exactly the CSE equivalence we want.
  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(),
                        hash_combine_range(I->operand_values()));
  }

  // Sentinel keys must only compare equal by pointer identity; real keys use
  // structural equality so duplicates are detected.
  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};
2390
2391} // end anonymous namespace
2392
/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
/// removal, in favor of the VPlan-based one.
static void legacyCSE(BasicBlock *BB) {
  // Perform simple cse.
  // NOTE(review): the declaration of CSEMap (a map keyed via CSEDenseMapInfo)
  // is not visible in this excerpt — confirm against upstream.
  // Single forward pass: the first occurrence of each equivalence class is
  // kept and recorded; later structural duplicates are RAUW'd and erased.
  for (Instruction &In : llvm::make_early_inc_range(*BB)) {
    // Only instruction kinds CSEDenseMapInfo knows how to hash/compare.
    if (!CSEDenseMapInfo::canHandle(&In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(&In)) {
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
      continue;
    }

    // First sighting of this expression: remember it as the canonical copy.
    CSEMap[&In] = &In;
  }
}
2413
2414/// This function attempts to return a value that represents the ElementCount
2415/// at runtime. For fixed-width VFs we know this precisely at compile
2416/// time, but for scalable VFs we calculate it based on an estimate of the
2417/// vscale value.
2419 std::optional<unsigned> VScale) {
2420 unsigned EstimatedVF = VF.getKnownMinValue();
2421 if (VF.isScalable())
2422 if (VScale)
2423 EstimatedVF *= *VScale;
2424 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2425 return EstimatedVF;
2426}
2427
2430 ElementCount VF) const {
2431 // We only need to calculate a cost if the VF is scalar; for actual vectors
2432 // we should already have a pre-calculated cost at each VF.
2433 if (!VF.isScalar())
2434 return getCallWideningDecision(CI, VF).Cost;
2435
2436 Type *RetTy = CI->getType();
2438 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2439 return *RedCost;
2440
2442 for (auto &ArgOp : CI->args())
2443 Tys.push_back(ArgOp->getType());
2444
2445 InstructionCost ScalarCallCost =
2446 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2447
2448 // If this is an intrinsic we may have a lower cost for it.
2451 return std::min(ScalarCallCost, IntrinsicCost);
2452 }
2453 return ScalarCallCost;
2454}
2455
2457 if (VF.isScalar() || !canVectorizeTy(Ty))
2458 return Ty;
2459 return toVectorizedTy(Ty, VF);
2460}
2461
2464 ElementCount VF) const {
2466 assert(ID && "Expected intrinsic call!");
2467 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2468 FastMathFlags FMF;
2469 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2470 FMF = FPMO->getFastMathFlags();
2471
2474 SmallVector<Type *> ParamTys;
2475 std::transform(FTy->param_begin(), FTy->param_end(),
2476 std::back_inserter(ParamTys),
2477 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2478
2479 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2482 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2483}
2484
2486 // Fix widened non-induction PHIs by setting up the PHI operands.
2487 fixNonInductionPHIs(State);
2488
2489 // Don't apply optimizations below when no (vector) loop remains, as they all
2490 // require one at the moment.
2491 VPBasicBlock *HeaderVPBB =
2492 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2493 if (!HeaderVPBB)
2494 return;
2495
2496 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2497
2498 // Remove redundant induction instructions.
2499 legacyCSE(HeaderBB);
2500}
2501
2503 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2505 for (VPRecipeBase &P : VPBB->phis()) {
2507 if (!VPPhi)
2508 continue;
2509 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2510 // Make sure the builder has a valid insert point.
2511 Builder.SetInsertPoint(NewPhi);
2512 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2513 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2514 }
2515 }
2516}
2517
2518void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2519 // We should not collect Scalars more than once per VF. Right now, this
2520 // function is called from collectUniformsAndScalars(), which already does
2521 // this check. Collecting Scalars for VF=1 does not make any sense.
2522 assert(VF.isVector() && !Scalars.contains(VF) &&
2523 "This function should not be visited twice for the same VF");
2524
2525 // This avoids any chances of creating a REPLICATE recipe during planning
2526 // since that would result in generation of scalarized code during execution,
2527 // which is not supported for scalable vectors.
2528 if (VF.isScalable()) {
2529 Scalars[VF].insert_range(Uniforms[VF]);
2530 return;
2531 }
2532
2534
2535 // These sets are used to seed the analysis with pointers used by memory
2536 // accesses that will remain scalar.
2538 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2539 auto *Latch = TheLoop->getLoopLatch();
2540
2541 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2542 // The pointer operands of loads and stores will be scalar as long as the
2543 // memory access is not a gather or scatter operation. The value operand of a
2544 // store will remain scalar if the store is scalarized.
2545 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2546 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2547 assert(WideningDecision != CM_Unknown &&
2548 "Widening decision should be ready at this moment");
2549 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2550 if (Ptr == Store->getValueOperand())
2551 return WideningDecision == CM_Scalarize;
2552 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2553 "Ptr is neither a value or pointer operand");
2554 return WideningDecision != CM_GatherScatter;
2555 };
2556
2557 // A helper that returns true if the given value is a getelementptr
2558 // instruction contained in the loop.
2559 auto IsLoopVaryingGEP = [&](Value *V) {
2560 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2561 };
2562
2563 // A helper that evaluates a memory access's use of a pointer. If the use will
2564 // be a scalar use and the pointer is only used by memory accesses, we place
2565 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2566 // PossibleNonScalarPtrs.
2567 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2568 // We only care about bitcast and getelementptr instructions contained in
2569 // the loop.
2570 if (!IsLoopVaryingGEP(Ptr))
2571 return;
2572
2573 // If the pointer has already been identified as scalar (e.g., if it was
2574 // also identified as uniform), there's nothing to do.
2575 auto *I = cast<Instruction>(Ptr);
2576 if (Worklist.count(I))
2577 return;
2578
2579 // If the use of the pointer will be a scalar use, and all users of the
2580 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2581 // place the pointer in PossibleNonScalarPtrs.
2582 if (IsScalarUse(MemAccess, Ptr) &&
2584 ScalarPtrs.insert(I);
2585 else
2586 PossibleNonScalarPtrs.insert(I);
2587 };
2588
2589 // We seed the scalars analysis with three classes of instructions: (1)
2590 // instructions marked uniform-after-vectorization and (2) bitcast,
2591 // getelementptr and (pointer) phi instructions used by memory accesses
2592 // requiring a scalar use.
2593 //
2594 // (1) Add to the worklist all instructions that have been identified as
2595 // uniform-after-vectorization.
2596 Worklist.insert_range(Uniforms[VF]);
2597
2598 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2599 // memory accesses requiring a scalar use. The pointer operands of loads and
2600 // stores will be scalar unless the operation is a gather or scatter.
2601 // The value operand of a store will remain scalar if the store is scalarized.
2602 for (auto *BB : TheLoop->blocks())
2603 for (auto &I : *BB) {
2604 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2605 EvaluatePtrUse(Load, Load->getPointerOperand());
2606 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2607 EvaluatePtrUse(Store, Store->getPointerOperand());
2608 EvaluatePtrUse(Store, Store->getValueOperand());
2609 }
2610 }
2611 for (auto *I : ScalarPtrs)
2612 if (!PossibleNonScalarPtrs.count(I)) {
2613 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2614 Worklist.insert(I);
2615 }
2616
2617 // Insert the forced scalars.
2618 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2619 // induction variable when the PHI user is scalarized.
2620 auto ForcedScalar = ForcedScalars.find(VF);
2621 if (ForcedScalar != ForcedScalars.end())
2622 for (auto *I : ForcedScalar->second) {
2623 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2624 Worklist.insert(I);
2625 }
2626
2627 // Expand the worklist by looking through any bitcasts and getelementptr
2628 // instructions we've already identified as scalar. This is similar to the
2629 // expansion step in collectLoopUniforms(); however, here we're only
2630 // expanding to include additional bitcasts and getelementptr instructions.
2631 unsigned Idx = 0;
2632 while (Idx != Worklist.size()) {
2633 Instruction *Dst = Worklist[Idx++];
2634 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2635 continue;
2636 auto *Src = cast<Instruction>(Dst->getOperand(0));
2637 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2638 auto *J = cast<Instruction>(U);
2639 return !TheLoop->contains(J) || Worklist.count(J) ||
2640 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2641 IsScalarUse(J, Src));
2642 })) {
2643 Worklist.insert(Src);
2644 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2645 }
2646 }
2647
2648 // An induction variable will remain scalar if all users of the induction
2649 // variable and induction variable update remain scalar.
2650 for (const auto &Induction : Legal->getInductionVars()) {
2651 auto *Ind = Induction.first;
2652 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2653
2654 // If tail-folding is applied, the primary induction variable will be used
2655 // to feed a vector compare.
2656 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2657 continue;
2658
2659 // Returns true if \p Indvar is a pointer induction that is used directly by
2660 // load/store instruction \p I.
2661 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2662 Instruction *I) {
2663 return Induction.second.getKind() ==
2666 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2667 };
2668
2669 // Determine if all users of the induction variable are scalar after
2670 // vectorization.
2671 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2672 auto *I = cast<Instruction>(U);
2673 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2674 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2675 });
2676 if (!ScalarInd)
2677 continue;
2678
2679 // If the induction variable update is a fixed-order recurrence, neither the
2680 // induction variable or its update should be marked scalar after
2681 // vectorization.
2682 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2683 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2684 continue;
2685
2686 // Determine if all users of the induction variable update instruction are
2687 // scalar after vectorization.
2688 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2689 auto *I = cast<Instruction>(U);
2690 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2691 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2692 });
2693 if (!ScalarIndUpdate)
2694 continue;
2695
2696 // The induction variable and its update instruction will remain scalar.
2697 Worklist.insert(Ind);
2698 Worklist.insert(IndUpdate);
2699 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2700 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2701 << "\n");
2702 }
2703
2704 Scalars[VF].insert_range(Worklist);
2705}
2706
                                                         ElementCount VF) {
  // NOTE(review): the first line of this signature (source line 2707,
  // declaring the function name and the Instruction *I parameter) is missing
  // from this extraction. Per the body and the caller at
  // memoryInstructionCanBeWidened, this answers whether predicated
  // instruction I must be scalarized (no non-scalar predicated lowering).
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    // Calls are always scalarized when the VF is scalar.
    if (VF.isScalar())
      return true;
    // NOTE(review): source line 2720 is missing from this extraction —
    // presumably the vector-VF decision for calls; verify against upstream.
  case Instruction::Load:
  case Instruction::Store: {
    // A predicated load/store avoids scalarization only if the target has a
    // legal masked load/store for the scalar type or a legal masked
    // gather/scatter for the vector type.
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getLoadStoreType(I);
    unsigned AS = getLoadStoreAddressSpace(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(Ty, VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) ||
                                TTI.isLegalMaskedGather(VTy, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) ||
                                TTI.isLegalMaskedScatter(VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
2747
  // NOTE(review): this function's signature (source line 2748) is missing
  // from this extraction; the visible body simply forwards the mask-required
  // query to LoopVectorizationLegality, passing whether the loop tail is
  // folded by masking.
  return Legal->isMaskRequired(I, foldTailByMasking());
}
2751
// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
// NOTE(review): this function's signature (source line 2753) is missing from
// this extraction. The body decides whether instruction I must execute under
// a mask (predication) when the loop is vectorized.
  // TODO: We can use the loop-preheader as context point here and get
  // context sensitive reasoning for isSafeToSpeculativelyExecute.
  // NOTE(review): source lines 2756-2758 are missing from this extraction —
  // presumably the speculation-safety condition guarding this early return.
  return false;

  // If the instruction was executed conditionally in the original scalar loop,
  // predication is needed with a mask whose lanes are all possibly inactive.
  if (Legal->blockNeedsPredication(I->getParent()))
    return true;

  // If we're not folding the tail by masking, predication is unnecessary.
  if (!foldTailByMasking())
    return false;

  // All that remain are instructions with side-effects originally executed in
  // the loop unconditionally, but now execute under a tail-fold mask (only)
  // having at least one active lane (the first). If the side-effects of the
  // instruction are invariant, executing it w/o (the tail-folding) mask is safe
  // - it will cause the same side-effects as when masked.
  switch(I->getOpcode()) {
  default:
    // NOTE(review): source line 2777 is missing from this extraction — the
    // llvm_unreachable(...) call whose message string follows.
                     "instruction should have been considered by earlier checks");
  case Instruction::Call:
    // Side-effects of a Call are assumed to be non-invariant, needing a
    // (fold-tail) mask.
    // NOTE(review): source line 2782 is missing from this extraction — the
    // assert(...) call whose message string follows.
           "should have returned earlier for calls not needing a mask");
    return true;
  case Instruction::Load:
    // If the address is loop invariant no predication is needed.
    return !Legal->isInvariant(getLoadStorePointerOperand(I));
  case Instruction::Store: {
    // For stores, we need to prove both speculation safety (which follows from
    // the same argument as loads), but also must prove the value being stored
    // is correct. The easiest form of the later is to require that all values
    // stored are the same.
    return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
             TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
  }
  case Instruction::UDiv:
  case Instruction::URem:
    // If the divisor is loop-invariant no predication is needed.
    return !Legal->isInvariant(I->getOperand(1));
  case Instruction::SDiv:
  case Instruction::SRem:
    // Conservative for now, since masked-off lanes may be poison and could
    // trigger signed overflow.
    return true;
  }
}
2807
  // NOTE(review): this function's signature and the condition guarding this
  // first return (source lines 2808-2810) are missing from this extraction.
  // Per the caller in getDivRemSpeculationCost it takes (CostKind, BB) and
  // returns a divisor modeling how rarely predicated block BB runs relative
  // to the loop header.
  return 1;
  // If the block wasn't originally predicated then return early to avoid
  // computing BlockFrequencyInfo unnecessarily.
  if (!Legal->blockNeedsPredication(BB))
    return 1;

  uint64_t HeaderFreq =
      getBFI().getBlockFreq(TheLoop->getHeader()).getFrequency();
  uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
  // A block dominated by the header cannot run more often than the header.
  assert(HeaderFreq >= BBFreq &&
         "Header has smaller block freq than dominated BB?");
  // Round the frequency ratio to the nearest integer divisor.
  return std::round((double)HeaderFreq / BBFreq);
}
2824
/// Returns the cost of scalarizing with predication (first) versus using the
/// safe-divisor idiom (second) for the predicated div/rem instruction \p I at
/// vectorization factor \p VF. The scalarization cost is invalid for scalable
/// VFs, where scalarization is not legal.
std::pair<InstructionCost, InstructionCost>
// NOTE(review): source line 2826 is missing from this extraction — the
// qualified function name and the Instruction *I parameter of this
// definition (the caller in isScalarWithPredication invokes it as
// getDivRemSpeculationCost(I, VF)).
                                                   ElementCount VF) {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  // NOTE(review): source line 2832 is missing from this extraction — likely
  // another assert; verify against upstream.

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost +=
        VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost +=
        VF.getFixedValue() *
        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost =
        ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
  }

  InstructionCost SafeDivisorCost = 0;
  auto *VecTy = toVectorTy(I->getType(), VF);
  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost +=
      TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
                             toVectorTy(Type::getInt1Ty(I->getContext()), VF),
  // NOTE(review): source line 2872 is missing from this extraction — the
  // remaining arguments closing this getCmpSelInstrCost call.

  // Cost of the widened div/rem itself, fed the select-guarded divisor.
  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      I->getOpcode(), VecTy, CostKind,
      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
      Operands, I);
  return {ScalarizationCost, SafeDivisorCost};
}
2882
// NOTE(review): source line 2883 is missing from this extraction — the
// return type and qualified name of this definition. The body answers
// whether the interleaved access \p I can be widened for factor \p VF.
    Instruction *I, ElementCount VF) const {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  // NOTE(review): source line 2886 is missing from this extraction — the
  // assert(...) call whose message string follows.
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");
  unsigned InterleaveFactor = Group->getFactor();

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // For scalable vectors, the interleave factors must be <= 8 since we require
  // the (de)interleaveN intrinsics instead of shufflevectors.
  if (VF.isScalable() && InterleaveFactor > 8)
    return false;

  // If the group involves a non-integral pointer, we may not be able to
  // losslessly cast all values to a common type.
  bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
  for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
    Instruction *Member = Group->getMember(Idx);
    if (!Member)
      continue;
    auto *MemberTy = getLoadStoreType(Member);
    bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
    // Don't coerce non-integral pointers to integers or vice versa.
    if (MemberNI != ScalarNI)
      // TODO: Consider adding special nullptr value case here
      return false;
    if (MemberNI && ScalarNI &&
        ScalarTy->getPointerAddressSpace() !=
            MemberTy->getPointerAddressSpace())
      return false;
  }

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
  // NOTE(review): source line 2929 is missing from this extraction — the
  // initializer expression of this flag.
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
  // NOTE(review): source line 2932 is missing from this extraction — the
  // final conjunct of this initializer.
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && !Group->isFull();
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  // NOTE(review): source line 2943 is missing from this extraction — the
  // assert(...) call whose message string follows.
         "Masked interleave-groups for predicated accesses are not enabled.");

  if (Group->isReverse())
    return false;

  // TODO: Support interleaved access that requires a gap mask for scalable VFs.
  bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
                          StoreAccessWithGapsRequiresMasking;
  if (VF.isScalable() && NeedsMaskForGaps)
    return false;

  // Finally, the group can only be widened if the target supports the
  // required masked memory operation.
  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
                          : TTI.isLegalMaskedStore(Ty, Alignment, AS);
}
2961
// NOTE(review): source line 2962 is missing from this extraction — the
// return type and qualified name of this definition. The body answers
// whether memory instruction \p I can be widened for \p VF.
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);
  auto *ScalarTy = getLoadStoreType(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I, VF))
    return false;

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getDataLayout();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}
2987
/// Collect the instructions in the loop that will remain uniform after
/// vectorization with factor \p VF — i.e. only lane 0 of the unrolled
/// iterations is demanded. Results are cached in Uniforms[VF].
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // Now we know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto IsOutOfScope = [&](Value *V) -> bool {
    // NOTE(review): source line 3008 is missing from this extraction — the
    // definition of I, presumably I = dyn_cast<Instruction>(V); verify.
    return (!I || !TheLoop->contains(I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that require predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (IsOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isPredicatedInst(I)) {
      LLVM_DEBUG(
          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                 << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branches exiting the loop. If the branch
  // condition is an instruction contained in the loop that is only used by the
  // branch, it is uniform. Note conditions from uncountable early exits are not
  // uniform.
  // NOTE(review): source line 3039 is missing from this extraction — the
  // declaration of the Exiting container used below.
  TheLoop->getExitingBlocks(Exiting);
  for (BasicBlock *E : Exiting) {
    if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
      continue;
    auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
      AddToWorklistIfAllowed(Cmp);
  }

  auto PrevVF = VF.divideCoefficientBy(2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto IsUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(I))
        return false;
    }
    if (!Legal->isUniformMemOp(*I, VF))
      return false;
    if (isa<LoadInst>(I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
  };

  // A widening decision of Widen/Widen_Reverse/Interleave (or a uniform mem
  // op) means the access will not be scalarized.
  auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (IsUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
      return false;
    return getLoadStorePointerOperand(I) == Ptr &&
           (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(&I))
            AddToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
        if (IsOutOfScope(EVI->getAggregateOperand())) {
          AddToWorklistIfAllowed(EVI);
          continue;
        }
        // Only ExtractValue instructions where the aggregate value comes from a
        // call are allowed to be non-uniform.
        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be call return value");
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // If the pointer can be proven to be uniform, always add it to the
      // worklist.
      if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
        AddToWorklistIfAllowed(cast<Instruction>(Ptr));

      if (IsUniformMemOpUse(&I))
        AddToWorklistIfAllowed(&I);

      if (IsVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (IsOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(V);
    bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
      auto *UI = cast<Instruction>(U);
      return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
    });
    if (UsersAreMemAccesses)
      AddToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added , its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (IsOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFixedOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
          }))
        AddToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == Ind || Worklist.count(I) ||
             IsVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    AddToWorklistIfAllowed(Ind);
    AddToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert_range(Worklist);
}
3227
  // NOTE(review): this function's signature (source line 3228) is missing
  // from this extraction. The body reports a vectorization failure and
  // returns true when any runtime check (pointer aliasing, SCEV predicate,
  // or symbolic stride) would be required — unacceptable when optimizing
  // for size (-Os/-Oz).
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  // Runtime pointer-aliasing checks cannot be emitted under -Os/-Oz.
  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // A non-trivial SCEV predicate would also require a runtime guard.
  if (!PSE.getPredicate().isAlwaysTrue()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}
3260
/// Returns (and caches in IsScalableVectorizationAllowed) whether scalable
/// vectorization is feasible for this loop at all: the target or a force
/// flag must support scalable vectors, the user hint must not disable them,
/// and the loop's reductions, element types and dependence distances must be
/// compatible.
bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
  // Return the cached answer if we already decided.
  if (IsScalableVectorizationAllowed)
    return *IsScalableVectorizationAllowed;

  IsScalableVectorizationAllowed = false;
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return false;

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                            "ScalableVectorizationDisabled", ORE, TheLoop);
    return false;
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    // NOTE(review): source line 3288 is missing from this extraction — the
    // reportVectorizationInfo( call whose arguments follow.
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
        // NOTE(review): source line 3299 is missing from this extraction —
        // the second conjunct of this predicate (presumably a TTI legality
        // query on Ty for scalable vectors).
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  // Safe-distance analysis for unknown vector widths needs a max vscale.
  if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
    reportVectorizationInfo("The target does not provide maximum vscale value "
                            "for safe distance analysis.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  IsScalableVectorizationAllowed = true;
  return true;
}
3317
/// Returns the maximum legal scalable VF, capped by the maximum safe number
/// of elements \p MaxSafeElements, or `scalable x 0` when scalable
/// vectorization is not allowed for this loop.
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  if (!isScalableVectorizationAllowed())
    return ElementCount::getScalable(0);

  // With no dependence-distance restriction, any width is legal.
  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());
  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
  // Limit MaxScalableVF by the maximum safe dependence distance.
  // NOTE(review): MaxVScale is dereferenced unchecked — presumably non-empty
  // because isScalableVectorizationAllowed() already rejected the
  // !isSafeForAnyVectorWidth() && !getMaxVScale() combination; verify.
  MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);

  if (!MaxScalableVF)
    // NOTE(review): source line 3333 is missing from this extraction — the
    // reportVectorizationInfo( call whose arguments follow.
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}
3340
/// Computes the maximum feasible (fixed, scalable) VF pair given the maximum
/// trip count, a user-requested VF and interleave count, and whether the
/// tail will be folded by masking. An unsafe fixed UserVF is clamped with a
/// remark; an unsafe or unsupported scalable UserVF is ignored with a remark.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
    bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElementsPowerOf2 =
      bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
  // Further restrict by the store-to-load forwarding distance when needed.
  if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
    MaxSafeElementsPowerOf2 =
        std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
  }
  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);

  if (!Legal->isSafeForAnyVectorWidth())
    this->MaxSafeElements = MaxSafeElementsPowerOf2;

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);

      return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // NOTE(review): source line 3404 is missing from this extraction — the
    // condition opening this branch (presumably a check that the target does
    // not support scalable vectors).
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  FixedScalableVFPair Result(ElementCount::getFixed(1),
  // NOTE(review): source line 3436 is missing from this extraction — the
  // second constructor argument (the initial scalable VF).
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeFixedVF, UserIC, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeScalableVF, UserIC, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
3453
// Determine the maximum feasible vectorization factors (fixed and scalable)
// for this loop, honoring the scalar-epilogue policy and, if needed, falling
// back to folding the tail by masking.
// NOTE(review): this extract appears to be missing some source lines (case
// labels, report-call openers, returns); code tokens are preserved as-is.
FixedScalableVFPair
  // Runtime pointer checks introduce divergent control flow; bail out for
  // targets with branch divergence.
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
  }

  ScalarEvolution *SE = PSE.getSE();
  // MaxTC is the largest trip count SCEV can prove; 0 means "unknown".
  unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
  if (!MaxTC && ScalarEpilogueStatus == CM_ScalarEpilogueAllowed)
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC != ElementCount::getFixed(MaxTC))
    LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
  // A single-iteration loop has nothing to vectorize.
  if (TC.isScalar()) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
  }

  // If BTC matches the widest induction type and is -1 then the trip count
  // computation will wrap to 0 and the vector trip count will be 0. Do not try
  // to vectorize.
  const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
  if (!isa<SCEVCouldNotCompute>(BTC) &&
      BTC->getType()->getScalarSizeInBits() >=
          Legal->getWidestInductionType()->getScalarSizeInBits() &&
          SE->getMinusOne(BTC->getType()))) {
        "Trip count computation wrapped",
        "backedge-taken count is -1, loop trip count wrapped to 0",
        "TripCountWrapped", ORE, TheLoop);
  }

  // Dispatch on the scalar-epilogue policy; the "allowed" case computes the
  // max VF without tail folding, the others fall through to tail folding.
  switch (ScalarEpilogueStatus) {
    return computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false);
    [[fallthrough]];
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
    // fallthrough as a special case of OptForSize
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.

    break;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors =
      computeFeasibleMaxVF(MaxTC, UserVF, UserIC, true);

  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
  std::optional<unsigned> MaxPowerOf2RuntimeVF =
      MaxFactors.FixedVF.getFixedValue();
  if (MaxFactors.ScalableVF) {
    std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
    if (MaxVScale) {
      // Use the largest runtime lane count the scalable VF could reach.
      MaxPowerOf2RuntimeVF = std::max<unsigned>(
          *MaxPowerOf2RuntimeVF,
          *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
    } else
      MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
  }

  // Returns true when the (guard-refined) exit count is provably a multiple
  // of MaxVF * UserIC, i.e. no scalar remainder iterations would remain.
  auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
    // Return false if the loop is neither a single-latch-exit loop nor an
    // early-exit loop as tail-folding is not supported in that case.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
        !Legal->hasUncountableEarlyExit())
      return false;
    unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
    ScalarEvolution *SE = PSE.getSE();
    // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
    // with uncountable exits. For countable loops, the symbolic maximum must
    // remain identical to the known back-edge taken count.
    const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
    assert((Legal->hasUncountableEarlyExit() ||
            BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
           "Invalid loop count");
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    return Rem->isZero();
  };

  if (MaxPowerOf2RuntimeVF > 0u) {
    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
           "MaxFixedVF must be a power of 2");
    if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
  if (ExpectedTC && ExpectedTC->isFixed() &&
      ExpectedTC->getFixedValue() <=
          TTI.getMinTripCountTailFoldingThreshold()) {
    if (MaxPowerOf2RuntimeVF > 0u) {
      // If we have a low-trip-count, and the fixed-width VF is known to divide
      // the trip count but the scalable factor does not, use the fixed-width
      // factor in preference to allow the generation of a non-predicated loop.
      if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
          NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
        LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
                             "remain for any chosen VF.\n");
        MaxFactors.ScalableVF = ElementCount::getScalable(0);
        return MaxFactors;
      }
    }

      "The trip count is below the minial threshold value.",
      "loop trip count is too low, avoiding vectorization", "LowTripCount",
      ORE, TheLoop);
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
  setTailFoldingStyle(ContainsScalableVF, UserIC);
  if (foldTailByMasking()) {
    if (foldTailWithEVL()) {
      LLVM_DEBUG(
          dbgs()
          << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
             "try to generate VP Intrinsics with scalable vector "
             "factors only.\n");
      // Tail folded loop using VP intrinsics restricts the VF to be scalable
      // for now.
      // TODO: extend it for fixed vectors, if required.
      assert(ContainsScalableVF && "Expected scalable vector factor.");

      MaxFactors.FixedVF = ElementCount::getFixed(1);
    }
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
  }

  if (TC.isZero()) {
      "unable to calculate the loop count due to complex control flow",
      "UnknownLoopCountComplexCFG", ORE, TheLoop);
  }

      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
}
3664
// Decide whether register pressure should be computed when costing VF.
// NOTE(review): the first line of the signature is not visible in this
// extract; code tokens below are preserved as-is.
    ElementCount VF) {
  // An explicit command-line setting (getNumOccurrences() != 0) wins.
  if (ConsiderRegPressure.getNumOccurrences())
    return ConsiderRegPressure;

  // TODO: We should eventually consider register pressure for all targets. The
  // TTI hook is temporary whilst target-specific issues are being fixed.
  if (TTI.shouldConsiderVectorizationRegPressure())
    return true;

  if (!useMaxBandwidth(VF.isScalable()
    return false;
  // Only calculate register pressure for VFs enabled by MaxBandwidth.
      VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
}
3684
// Whether wider-than-default VFs ("maximize bandwidth") should be explored:
// forced by the MaximizeBandwidth option, or — when the option is unset —
// requested by the target or required to use vector call variants.
// NOTE(review): the signature line is missing from this extract.
  return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
                               (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
                                Legal->hasVectorCallVariants())));
}
3692
3693ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3694 ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
3695 bool FoldTailByMasking) const {
3696 unsigned EstimatedVF = VF.getKnownMinValue();
3697 if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
3698 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
3699 auto Min = Attr.getVScaleRangeMin();
3700 EstimatedVF *= Min;
3701 }
3702
3703 // When a scalar epilogue is required, at least one iteration of the scalar
3704 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3705 // max VF that results in a dead vector loop.
3706 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
3707 MaxTripCount -= 1;
3708
3709 // When the user specifies an interleave count, we need to ensure that
3710 // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
3711 unsigned IC = UserIC > 0 ? UserIC : 1;
3712 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
3713
3714 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
3715 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
3716 // If upper bound loop trip count (TC) is known at compile time there is no
3717 // point in choosing VF greater than TC / IC (as done in the loop below).
3718 // Select maximum power of two which doesn't exceed TC / IC. If VF is
3719 // scalable, we only fall back on a fixed VF when the TC is less than or
3720 // equal to the known number of lanes.
3721 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount / IC);
3722 if (ClampedUpperTripCount == 0)
3723 ClampedUpperTripCount = 1;
3724 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3725 "exceeding the constant trip count"
3726 << (UserIC > 0 ? " divided by UserIC" : "") << ": "
3727 << ClampedUpperTripCount << "\n");
3728 return ElementCount::get(ClampedUpperTripCount,
3729 FoldTailByMasking ? VF.isScalable() : false);
3730 }
3731 return VF;
3732}
3733
// Compute the largest VF the target can support, starting from the widest
// register / widest element type and optionally re-sizing under the
// max-bandwidth policy using the smallest element type.
// NOTE(review): this extract is missing the RGK_FixedWidthVector operands of
// the two ternaries and the RegKind declaration; code is preserved as-is.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
    ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
  // Search for a scalable or fixed max VF following the scalability of the
  // dependence-safe bound MaxSafeVF.
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  ElementCount MaxVF = clampVFByMaxTripCount(
      MaxVectorElementCount, MaxTripCount, UserIC, FoldTailByMasking);
  // If the MaxVF was already clamped, there's no point in trying to pick a
  // larger one.
  if (MaxVF != MaxVectorElementCount)
    return MaxVF;

      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector

  // Record the largest VF permissible without max-bandwidth, keyed by
  // scalability.
  if (MaxVF.isScalable())
    MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
  else
    MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;

  if (useMaxBandwidth(RegKind)) {
    // Under max-bandwidth, size the VF by the smallest element type instead
    // of the widest one.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
        ComputeScalableMaxVF);
    MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Respect the target's minimum VF, if larger than what we computed.
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }

    MaxVF =
        clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC, FoldTailByMasking);

    if (MaxVectorElementCount != MaxVF) {
      // Invalidate any widening decisions we might have made, in case the loop
      // requires prediction (decided later), but we have already made some
      // load/store widening decisions.
      invalidateCostModelingDecisions();
    }
  }
  return MaxVF;
}
3808
3809bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3810 const VectorizationFactor &B,
3811 const unsigned MaxTripCount,
3812 bool HasTail,
3813 bool IsEpilogue) const {
3814 InstructionCost CostA = A.Cost;
3815 InstructionCost CostB = B.Cost;
3816
3817 // Improve estimate for the vector width if it is scalable.
3818 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3819 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3820 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3821 if (A.Width.isScalable())
3822 EstimatedWidthA *= *VScale;
3823 if (B.Width.isScalable())
3824 EstimatedWidthB *= *VScale;
3825 }
3826
3827 // When optimizing for size choose whichever is smallest, which will be the
3828 // one with the smallest cost for the whole loop. On a tie pick the larger
3829 // vector width, on the assumption that throughput will be greater.
3830 if (CM.CostKind == TTI::TCK_CodeSize)
3831 return CostA < CostB ||
3832 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3833
3834 // Assume vscale may be larger than 1 (or the value being tuned for),
3835 // so that scalable vectorization is slightly favorable over fixed-width
3836 // vectorization.
3837 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
3838 A.Width.isScalable() && !B.Width.isScalable();
3839
3840 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3841 const InstructionCost &RHS) {
3842 return PreferScalable ? LHS <= RHS : LHS < RHS;
3843 };
3844
3845 // To avoid the need for FP division:
3846 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3847 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3848 bool LowerCostWithoutTC =
3849 CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3850 if (!MaxTripCount)
3851 return LowerCostWithoutTC;
3852
3853 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3854 InstructionCost VectorCost,
3855 InstructionCost ScalarCost) {
3856 // If the trip count is a known (possibly small) constant, the trip count
3857 // will be rounded up to an integer number of iterations under
3858 // FoldTailByMasking. The total cost in that case will be
3859 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3860 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3861 // some extra overheads, but for the purpose of comparing the costs of
3862 // different VFs we can use this to compare the total loop-body cost
3863 // expected after vectorization.
3864 if (HasTail)
3865 return VectorCost * (MaxTripCount / VF) +
3866 ScalarCost * (MaxTripCount % VF);
3867 return VectorCost * divideCeil(MaxTripCount, VF);
3868 };
3869
3870 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3871 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3872 bool LowerCostWithTC = CmpFn(RTCostA, RTCostB);
3873 LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
3874 dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
3875 << " has lower cost than VF "
3876 << (LowerCostWithTC ? B.Width : A.Width)
3877 << " when taking the cost of the remaining scalar loop iterations "
3878 "into consideration for a maximum trip count of "
3879 << MaxTripCount << ".\n";
3880 });
3881 return LowerCostWithTC;
3882}
3883
3884bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3885 const VectorizationFactor &B,
3886 bool HasTail,
3887 bool IsEpilogue) const {
3888 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3889 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3890 IsEpilogue);
3891}
3892
// Collect all (recipe, VF) pairs whose VPlan-based cost is invalid and emit
// one optimization remark per recipe, listing the affected VFs.
// NOTE(review): this extract is missing the function signature, the inner
// block-iteration header, the Numbering map declaration and the TypeSwitch
// opener; code tokens are preserved as-is.
  using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
  SmallVector<RecipeVFPair> InvalidCosts;
  for (const auto &Plan : VPlans) {
    for (ElementCount VF : Plan->vectorFactors()) {
      // The VPlan-based cost model is designed for computing vector cost.
      // Querying VPlan-based cost model with a scalar VF will cause some
      // errors because we expect the VF is vector for most of the widen
      // recipes.
      if (VF.isScalar())
        continue;

      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      precomputeCosts(*Plan, VF, CostCtx);
      auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
        for (auto &R : *VPBB) {
          // Remember every recipe whose cost at this VF is invalid.
          if (!R.cost(VF, CostCtx).isValid())
            InvalidCosts.emplace_back(&R, VF);
        }
      }
    }
  }
  if (InvalidCosts.empty())
    return;

  // Emit a report of VFs with invalid costs in the loop.

  // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
  unsigned I = 0;
  for (auto &Pair : InvalidCosts)
    if (Numbering.try_emplace(Pair.first, I).second)
      ++I;

  // Sort the list, first on recipe(number) then on VF.
  sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
    unsigned NA = Numbering[A.first];
    unsigned NB = Numbering[B.first];
    if (NA != NB)
      return NA < NB;
    return ElementCount::isKnownLT(A.second, B.second);
  });

  // For a list of ordered recipe-VF pairs:
  //   [(load, VF1), (load, VF2), (store, VF1)]
  // group the recipes together to emit separate remarks for:
  //   load  (VF1, VF2)
  //   store (VF1)
  auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
  auto Subset = ArrayRef<RecipeVFPair>();
  do {
    if (Subset.empty())
      Subset = Tail.take_front(1);

    VPRecipeBase *R = Subset.front().first;

    // Map the recipe to an IR opcode used only for remark text.
    unsigned Opcode =
        .Case([](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
        .Case(
            [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
        .Case([](const VPWidenLoadRecipe *R) { return Instruction::Load; })
        .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
            [](const auto *R) { return Instruction::Call; })
            [](const auto *R) { return R->getOpcode(); })
        .Case([](const VPInterleaveRecipe *R) {
          return R->getStoredValues().empty() ? Instruction::Load
                                              : Instruction::Store;
        })
        .Case([](const VPReductionRecipe *R) {
          return RecurrenceDescriptor::getOpcode(R->getRecurrenceKind());
        });

    // If the next recipe is different, or if there are no other pairs,
    // emit a remark for the collated subset. e.g.
    //   [(load, VF1), (load, VF2))]
    // to emit:
    //  remark: invalid costs for 'load' at VF=(VF1, VF2)
    if (Subset == Tail || Tail[Subset.size()].first != R) {
      std::string OutString;
      raw_string_ostream OS(OutString);
      assert(!Subset.empty() && "Unexpected empty range");
      OS << "Recipe with invalid costs prevented vectorization at VF=(";
      for (const auto &Pair : Subset)
        OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
      OS << "):";
      if (Opcode == Instruction::Call) {
        // Name the callee: intrinsic name, called scalar function, or the
        // function live-in carried as the last operand.
        StringRef Name = "";
        if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
          Name = Int->getIntrinsicName();
        } else {
          auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
          Function *CalledFn =
              WidenCall ? WidenCall->getCalledScalarFunction()
                        : cast<Function>(R->getOperand(R->getNumOperands() - 1)
                                             ->getLiveInIRValue());
          Name = CalledFn->getName();
        }
        OS << " call to " << Name;
      } else
        OS << " " << Instruction::getOpcodeName(Opcode);
      reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
                              R->getDebugLoc());
      Tail = Tail.drop_front(Subset.size());
      Subset = {};
    } else
      // Grow the subset by one element
      Subset = Tail.take_front(Subset.size() + 1);
  } while (!Tail.empty());
}
4008
/// Check if any recipe of \p Plan will generate a vector value, which will be
/// assigned a vector register.
// NOTE(review): the first line of the signature and the block-iteration
// headers are missing from this extract; code tokens are preserved as-is.
    const TargetTransformInfo &TTI) {
  assert(VF.isVector() && "Checking a scalar VF?");
  VPTypeAnalysis TypeInfo(Plan);
  DenseSet<VPRecipeBase *> EphemeralRecipes;
  collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
  // Set of already visited types.
  DenseSet<Type *> Visited;
    for (VPRecipeBase &R : *VPBB) {
      // Ephemeral recipes (e.g. assume-related) never materialize values.
      if (EphemeralRecipes.contains(&R))
        continue;
      // Continue early if the recipe is considered to not produce a vector
      // result. Note that this includes VPInstruction where some opcodes may
      // produce a vector, to preserve existing behavior as VPInstructions model
      // aspects not directly mapped to existing IR instructions.
      switch (R.getVPRecipeID()) {
      case VPRecipeBase::VPDerivedIVSC:
      case VPRecipeBase::VPScalarIVStepsSC:
      case VPRecipeBase::VPReplicateSC:
      case VPRecipeBase::VPInstructionSC:
      case VPRecipeBase::VPCanonicalIVPHISC:
      case VPRecipeBase::VPCurrentIterationPHISC:
      case VPRecipeBase::VPVectorPointerSC:
      case VPRecipeBase::VPVectorEndPointerSC:
      case VPRecipeBase::VPExpandSCEVSC:
      case VPRecipeBase::VPPredInstPHISC:
      case VPRecipeBase::VPBranchOnMaskSC:
        continue;
      case VPRecipeBase::VPReductionSC:
      case VPRecipeBase::VPActiveLaneMaskPHISC:
      case VPRecipeBase::VPWidenCallSC:
      case VPRecipeBase::VPWidenCanonicalIVSC:
      case VPRecipeBase::VPWidenCastSC:
      case VPRecipeBase::VPWidenGEPSC:
      case VPRecipeBase::VPWidenIntrinsicSC:
      case VPRecipeBase::VPWidenSC:
      case VPRecipeBase::VPBlendSC:
      case VPRecipeBase::VPFirstOrderRecurrencePHISC:
      case VPRecipeBase::VPHistogramSC:
      case VPRecipeBase::VPWidenPHISC:
      case VPRecipeBase::VPWidenIntOrFpInductionSC:
      case VPRecipeBase::VPWidenPointerInductionSC:
      case VPRecipeBase::VPReductionPHISC:
      case VPRecipeBase::VPInterleaveEVLSC:
      case VPRecipeBase::VPInterleaveSC:
      case VPRecipeBase::VPWidenLoadEVLSC:
      case VPRecipeBase::VPWidenLoadSC:
      case VPRecipeBase::VPWidenStoreEVLSC:
      case VPRecipeBase::VPWidenStoreSC:
        break;
      default:
        llvm_unreachable("unhandled recipe");
      }

      // True when legalizing VectorTy at this VF actually keeps values in
      // vector registers (rather than splitting down to scalars).
      auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
        unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
        if (!NumLegalParts)
          return false;
        if (VF.isScalable()) {
          // <vscale x 1 x iN> is assumed to be profitable over iN because
          // scalable registers are a distinct register class from scalar
          // ones. If we ever find a target which wants to lower scalable
          // vectors back to scalars, we'll need to update this code to
          // explicitly ask TTI about the register class uses for each part.
          return NumLegalParts <= VF.getKnownMinValue();
        }
        // Two or more elements that share a register - are vectorized.
        return NumLegalParts < VF.getFixedValue();
      };

      // If no def nor is a store, e.g., branches, continue - no value to check.
      if (R.getNumDefinedValues() == 0 &&
        continue;
      // For multi-def recipes, currently only interleaved loads, suffice to
      // check first def only.
      // For stores check their stored value; for interleaved stores suffice
      // the check first stored value only. In all cases this is the second
      // operand.
      VPValue *ToCheck =
          R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
      Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
      // Each scalar type only needs to be checked once.
      if (!Visited.insert({ScalarTy}).second)
        continue;
      Type *WideTy = toVectorizedTy(ScalarTy, VF);
      if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
        return true;
    }
  }

  return false;
}
4105
// Returns true if any region inside the plan's vector loop region is a
// replicator region (per VPRB->isReplicator()).
// NOTE(review): the any_of(...) opener line is missing from this extract.
static bool hasReplicatorRegion(VPlan &Plan) {
                  Plan.getVectorLoopRegion()->getEntry())),
                [](auto *VPRB) { return VPRB->isReplicator(); });
}
4111
#ifndef NDEBUG
// Debug-build helper: evaluate the cost of every candidate VF across all
// VPlans and return the most profitable VectorizationFactor (the scalar
// factor when vectorization does not pay off).
// NOTE(review): this extract is missing the RUs declaration and some case
// labels in the VPInstruction switch; code tokens are preserved as-is.
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(
      any_of(VPlans,
             [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
      "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization &&
      (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register usage is only computed when some VF requires it.
    if (any_of(VFs, [this](ElementCount VF) {
          return CM.shouldConsiderRegPressureForVF(VF);
        }))
      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      // The cost for scalar VF=1 is already calculated, so ignore it.
      if (VF.isScalar())
        continue;

      InstructionCost C = CM.expectedCost(VF);

      // Add on other costs that are modelled in VPlan, but not in the legacy
      // cost model.
      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
      assert(VectorRegion && "Expected to have a vector region!");
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
               vp_depth_first_shallow(VectorRegion->getEntry()))) {
        for (VPRecipeBase &R : *VPBB) {
          auto *VPI = dyn_cast<VPInstruction>(&R);
          if (!VPI)
            continue;
          switch (VPI->getOpcode()) {
          // Selects are only modelled in the legacy cost model for safe
          // divisors.
          case Instruction::Select: {
            if (auto *WR =
                    dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
              switch (WR->getOpcode()) {
              case Instruction::UDiv:
              case Instruction::SDiv:
              case Instruction::URem:
              case Instruction::SRem:
                continue;
              default:
                break;
              }
            }
            C += VPI->cost(VF, CostCtx);
            break;
          }
            unsigned Multiplier =
                cast<VPConstantInt>(VPI->getOperand(2))->getZExtValue();
            C += VPI->cost(VF * Multiplier, CostCtx);
            break;
          }
            C += VPI->cost(VF, CostCtx);
            break;
          default:
            break;
          }
        }
      }

      // Add the cost of any spills due to excess register usage
      if (CM.shouldConsiderRegPressureForVF(VF))
        C += RUs[I].spillCost(CostCtx, ForceTargetNumVectorRegs);

      VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
      unsigned Width =
          estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
                        << " costs: " << (Candidate.Cost / Width));
      if (VF.isScalable())
        LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                          << CM.getVScaleForTuning().value_or(1) << ")");
      LLVM_DEBUG(dbgs() << ".\n");

      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }

      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
        ChosenFactor = Candidate;
    }
  }

  // Conditional stores can be disabled by flag; fall back to the scalar loop.
  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost,
                                   !CM.foldTailByMasking())) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  return ChosenFactor;
}
#endif
4252
/// Returns true if the VPlan contains a VPReductionPHIRecipe with
/// FindLast recurrence kind.
// NOTE(review): the any_of(...) opener (and the recipe range it iterates)
// is missing from this extract; code tokens are preserved as-is.
static bool hasFindLastReductionPhi(VPlan &Plan) {
              [](VPRecipeBase &R) {
                auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(&R);
                return RedPhi &&
                       RecurrenceDescriptor::isFindLastRecurrenceKind(
                           RedPhi->getRecurrenceKind());
              });
}
4264
/// Returns true if the VPlan contains header phi recipes that are not currently
/// supported for epilogue vectorization.
// NOTE(review): the function signature and the range argument of any_of are
// missing from this extract; code tokens are preserved as-is.
  return any_of(
      [](VPRecipeBase &R) {
        // Widened inductions without an underlying PHI are unsupported.
        if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
          return !WidenInd->getPHINode();
        // FindLast reductions, and reduction phis without an underlying
        // value, are unsupported.
        auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(&R);
        return RedPhi && (RecurrenceDescriptor::isFindLastRecurrenceKind(
                              RedPhi->getRecurrenceKind()) ||
                          !RedPhi->getUnderlyingValue());
      });
}
4279
4280bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4281 ElementCount VF) const {
4282 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4283 // reductions need special handling and are currently unsupported.
4284 if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4285 if (!Legal->isReductionVariable(&Phi))
4286 return Legal->isFixedOrderRecurrence(&Phi);
4287 RecurKind Kind =
4288 Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
4289 return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind);
4290 }))
4291 return false;
4292
4293 // FindLast reductions and inductions without underlying PHI require special
4294 // handling and are currently not supported for epilogue vectorization.
4295 if (hasUnsupportedHeaderPhiRecipe(getPlanFor(VF)))
4296 return false;
4297
4298 // Phis with uses outside of the loop require special handling and are
4299 // currently unsupported.
4300 for (const auto &Entry : Legal->getInductionVars()) {
4301 // Look for uses of the value of the induction at the last iteration.
4302 Value *PostInc =
4303 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4304 for (User *U : PostInc->users())
4305 if (!OrigLoop->contains(cast<Instruction>(U)))
4306 return false;
4307 // Look for uses of penultimate value of the induction.
4308 for (User *U : Entry.first->users())
4309 if (!OrigLoop->contains(cast<Instruction>(U)))
4310 return false;
4311 }
4312
4313 // Epilogue vectorization code has not been auditted to ensure it handles
4314 // non-latch exits properly. It may be fine, but it needs auditted and
4315 // tested.
4316 // TODO: Add support for loops with an early exit.
4317 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4318 return false;
4319
4320 return true;
4321}
4322
// Heuristic profitability check for vectorizing the epilogue loop, based on
// the main loop's VF * IC reaching a minimum threshold.
// NOTE(review): the first line of the signature and one arm of the ternary
// initializing MinVFThreshold are missing from this extract; code tokens are
// preserved as-is.
    const ElementCount VF, const unsigned IC) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out.
  if (!TTI.preferEpilogueVectorization(VF * IC))
    return false;

  // Threshold comes from the command-line option when explicitly set,
  // otherwise from the target.
  unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
                                : TTI.getEpilogueVectorizationMinVF();
  return estimateElementCount(VF * IC, VScaleForTuning) >= MinVFThreshold;
}
4339
// Select the vectorization factor for the epilogue loop that runs after the
// main vector loop, or a disabled VectorizationFactor when epilogue
// vectorization should not be performed.
// NOTE(review): several source lines are elided in this view (the function
// signature, the 'Result' initialization, the EnableEpilogueVectorization and
// forced-VF guards) — confirm against upstream before relying on this text
// verbatim.
    ElementCount MainLoopVF, unsigned IC) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                      "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                      "is not a supported candidate.\n");
    return Result;
  }

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    if (hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                      "viable.\n");
    return Result;
  }

  if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                      "this loop\n");
    return Result;
  }

  // Check if a plan's vector loop processes fewer iterations than VF (e.g.
  // when interleave groups have been narrowed by narrowInterleaveGroups) and
  // return the adjusted, effective VF.
  using namespace VPlanPatternMatch;
  auto GetEffectiveVF = [](VPlan &Plan, ElementCount VF) -> ElementCount {
    auto *Exiting = Plan.getVectorLoopRegion()->getExitingBasicBlock();
    if (match(&Exiting->back(),
              m_BranchOnCount(m_Add(m_CanonicalIV(), m_Specific(&Plan.getUF())),
                              m_VPValue())))
      return ElementCount::get(1, VF.isScalable());
    return VF;
  };

  // Check if the main loop processes fewer than MainLoopVF elements per
  // iteration (e.g. due to narrowing interleave groups). Adjust MainLoopVF
  // as needed.
  VPlan &MainPlan = getPlanFor(MainLoopVF);
  MainLoopVF = GetEffectiveVF(MainPlan, MainLoopVF);

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
      estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));

  Type *TCType = Legal->getWidestInductionType();
  const SCEV *RemainingIterations = nullptr;
  unsigned MaxTripCount = 0;
  const SCEV *TC = vputils::getSCEVExprForVPValue(MainPlan.getTripCount(), PSE);
  assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
  const SCEV *KnownMinTC;
  bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
  bool ScalableRemIter = false;
  ScalarEvolution &SE = *PSE.getSE();
  // Use versions of TC and VF in which both are either scalable or fixed.
  if (ScalableTC == MainLoopVF.isScalable()) {
    ScalableRemIter = ScalableTC;
    RemainingIterations =
        SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
  } else if (ScalableTC) {
    // TC is vscale-based but the VF is fixed: estimate a concrete trip count
    // by substituting the tuning value of vscale.
    const SCEV *EstimatedTC = SE.getMulExpr(
        KnownMinTC,
        SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1)));
    RemainingIterations = SE.getURemExpr(
        EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
  } else
    // Fixed TC but scalable VF: compare against the estimated runtime VF.
    RemainingIterations =
        SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));

  // No iterations left to process in the epilogue.
  if (RemainingIterations->isZero())
    return Result;

  if (MainLoopVF.isFixed()) {
    // Upper bound on the epilogue trip count: one full main-loop step minus
    // one, tightened with SCEV range information when available.
    MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
    if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                            SE.getConstant(TCType, MaxTripCount))) {
      MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
    }
    LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
                      << MaxTripCount << "\n");
  }

  // True when a candidate VF is known to exceed the remaining iteration count.
  auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
    return SE.isKnownPredicate(CmpInst::ICMP_UGT, VF, RemIter);
  };
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(NextVF.Width))
      continue;

    ElementCount EffectiveVF =
        GetEffectiveVF(getPlanFor(NextVF.Width), NextVF.Width);
    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
    // vectors) or > the VF of the main loop (fixed vectors).
    if ((!EffectiveVF.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(EffectiveVF, EstimatedRuntimeVF)) ||
        (EffectiveVF.isScalable() &&
         ElementCount::isKnownGE(EffectiveVF, MainLoopVF)) ||
        (!EffectiveVF.isScalable() && !MainLoopVF.isScalable() &&
         ElementCount::isKnownGT(EffectiveVF, MainLoopVF)))
      continue;

    // If EffectiveVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors. If the epilogue plan
    // also has narrowed interleave groups, use the effective VF since
    // the epilogue step will be reduced to its IC.
    // TODO: We should also consider comparing against a scalable
    // RemainingIterations when SCEV is able to evaluate non-canonical
    // vscale-based expressions.
    if (!ScalableRemIter) {
      // Handle the case where EffectiveVF and RemainingIterations are in
      // different numerical spaces.
      if (EffectiveVF.isScalable())
        EffectiveVF = ElementCount::getFixed(
            estimateElementCount(EffectiveVF, CM.getVScaleForTuning()));
      if (SkipVF(SE.getElementCount(TCType, EffectiveVF), RemainingIterations))
        continue;
    }

    if (Result.Width.isScalar() ||
        isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
                         /*IsEpilogue*/ true))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
4496
std::pair<unsigned, unsigned>
// Returns the smallest and widest element widths (in bits) relevant for
// vectorizing this loop, derived either from reduction recurrences (when the
// loop has in-loop reductions but no memory element types were collected) or
// from the element types gathered by collectElementTypesForWidening.
// NOTE(review): the line carrying this function's name and parts of the
// reduction min/max expressions are elided in this view — confirm upstream.
  unsigned MinWidth = -1U; // UINT_MAX, so any real width replaces it.
  unsigned MaxWidth = 8;   // Assume at least byte-sized elements.
  const DataLayout &DL = TheFunction->getDataLayout();
  // For in-loop reductions, no element types are added to ElementTypesInLoop
  // if there are no loads/stores in the loop. In this case, check through the
  // reduction variables to determine the maximum width.
  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
    for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
      // When finding the min width used by the recurrence we need to account
      // for casts on the input operands of the recurrence.
      MinWidth = std::min(
          MinWidth,
          std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
      MaxWidth = std::max(MaxWidth,
    }
  } else {
    // Scan the widths of all element types seen by loads/stores/PHIs.
    for (Type *T : ElementTypesInLoop) {
      MinWidth = std::min<unsigned>(
          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
      MaxWidth = std::max<unsigned>(
          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
    }
  }
  return {MinWidth, MaxWidth};
}
4527
// Walk every instruction in the loop and record into ElementTypesInLoop the
// element types that matter for widening: loaded values, stored values, and
// the recurrence types of reduction PHIs.
// NOTE(review): the function signature line (collectElementTypesForWidening)
// and the first line of the in-loop-reduction condition are elided in this
// view — confirm against upstream.
  ElementTypesInLoop.clear();
  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : *BB) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc =
            Legal->getRecurrenceDescriptor(PN);
            TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
                                      RdxDesc.getRecurrenceType()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      assert(T->isSized() &&
             "Expected the load/store/recurrence type to be sized");

      ElementTypesInLoop.insert(T);
    }
  }
}
4569
unsigned
// Choose an interleave (unroll) count for the vectorized (or scalar) loop,
// balancing register pressure, reduction ILP, trip count, and memory-port
// saturation.
// NOTE(review): the line carrying this function's name and several interior
// guard/expression lines are elided in this view — confirm against upstream
// before relying on this text verbatim.
    InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Only interleave tail-folded loops if wide lane masks are requested, as the
  // overhead of multiple instructions to calculate the predicate is likely
  // not beneficial. If a scalar epilogue is not allowed for any other reason,
  // do not interleave.
  if (!CM.isScalarEpilogueAllowed() &&
      !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
    return 1;

    LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
                      "Unroll factor forced to be 1.\n");
    return 1;
  }

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // multiple parts.
  if (Plan.hasEarlyExit())
    return 1;

  const bool HasReductions =

  // FIXME: implement interleaving for FindLast transform correctly.
  if (hasFindLastReductionPhi(Plan))
    return 1;

  VPRegisterUsage R =
      calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    if (VF.isScalar())
      LoopCost = CM.expectedCost(VF);
    else
      LoopCost = cost(Plan, VF, &R);
    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(Pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (const auto &Pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(Pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.contains(Pair.first))
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
                                     MaxLocalUsers);
    // Don't count the induction variable as interleaved.
      TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(1U, (MaxLocalUsers - 1)));
    }

    // Keep the smallest IC over all register classes.
    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
  LLVM_DEBUG(dbgs() << "LV: MaxInterleaveFactor for the target is "
                    << MaxInterleaveCount << "\n");

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // Try to get the exact trip count, or an estimate based on profiling data or
  // ConstantMax from PSE, failing that.
  auto BestKnownTC =
      getSmallBestKnownTC(PSE, OrigLoop,
                          /*CanUseConstantMax=*/true,
                          /*CanExcludeZeroTrips=*/CM.isScalarEpilogueAllowed());

  // For fixed length VFs treat a scalable trip count as unknown.
  if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
    // Re-evaluate trip counts and VFs to be in the same numerical space.
    unsigned AvailableTC =
        estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
    unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning());

    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    if (CM.requiresScalarEpilogue(VF.isVector()))
      --AvailableTC;

    unsigned InterleaveCountLB = bit_floor(std::max(
        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));

    if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
      // If the best known trip count is exact, we select between two
      // prospective ICs, where
      //
      // 1) the aggressive IC is capped by the trip count divided by VF
      // 2) the conservative IC is capped by the trip count divided by (VF * 2)
      //
      // The final IC is selected in a way that the epilogue loop trip count is
      // minimized while maximizing the IC itself, so that we either run the
      // vector loop at least once if it generates a small epilogue loop, or
      // else we run the vector loop at least twice.

      unsigned InterleaveCountUB = bit_floor(std::max(
          1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
      MaxInterleaveCount = InterleaveCountLB;

      if (InterleaveCountUB != InterleaveCountLB) {
        unsigned TailTripCountUB =
            (AvailableTC % (EstimatedVF * InterleaveCountUB));
        unsigned TailTripCountLB =
            (AvailableTC % (EstimatedVF * InterleaveCountLB));
        // If both produce the same scalar tail, maximize the IC to do the same
        // work in fewer vector loop iterations.
        if (TailTripCountUB == TailTripCountLB)
          MaxInterleaveCount = InterleaveCountUB;
      }
    } else {
      // If trip count is an estimated compile time constant, limit the
      // IC to be capped by the trip count divided by VF * 2, such that the
      // vector loop runs at least twice to make interleaving seem profitable
      // when there is an epilogue loop present. Since exact Trip count is not
      // known we choose to be conservative in our IC estimate.
      MaxInterleaveCount = InterleaveCountLB;
    }
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleave =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
                                        SmallLoopCost / LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated. Count loads/stores by walking the plan's recipes.
    unsigned NumStores = 0;
    unsigned NumLoads = 0;
      for (VPRecipeBase &R : *VPBB) {
          NumLoads++;
          continue;
        }
          NumStores++;
          continue;
        }

        if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
          if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
            NumStores += StoreOps;
          else
            NumLoads += InterleaveR->getNumDefinedValues();
          continue;
        }
        if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
          NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
          NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
          continue;
        }
        if (isa<VPHistogramRecipe>(&R)) {
          // A histogram update both reads and writes memory.
          NumLoads++;
          NumStores++;
          continue;
        }
      }
    }
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
            [](VPRecipeBase &R) {
              auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
              return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                                  RedR->getRecurrenceKind()) ||
                              RecurrenceDescriptor::isFindIVRecurrenceKind(
                                  RedR->getRecurrenceKind()));
            });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && OrigLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          [](VPRecipeBase &R) {
            auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);

            return RedR && RedR->isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = MaxNestedScalarReductionIC;
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleave) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleave) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
4915
// Cost-model hack: decide whether the scalarized emulation of a masked memory
// access should be priced artificially high, effectively disabling
// vectorization for it, to mirror the old legality-based restrictions.
// NOTE(review): the signature line and the first line of the assert are
// elided in this view — confirm against upstream.
    ElementCount VF) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
         "Expecting a scalar emulated instruction");
  // Loads always use the hack; stores only once too many are predicated.
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}
4932
// For the given VF, find instructions that are scalar-with-predication and,
// when the predication discount makes it profitable, record their scalar
// costs in InstsToScalarize[VF]; also record which blocks stay predicated.
// NOTE(review): the function signature line (collectInstsToScalarize) and the
// block-predication guard condition are elided in this view — confirm
// against upstream.
  assert(VF.isVector() && "Expected VF >= 2");

  // If we've already collected the instructions to scalarize or the predicated
  // BBs after vectorization, there's nothing to do. Collection may already have
  // occurred if we have a user-selected VF and are now computing the expected
  // cost for interleaving.
  if (InstsToScalarize.contains(VF) ||
      PredicatedBBsAfterVectorization.contains(VF))
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic for:
        // 1. Scalars after vectorization, as there will only be a single copy
        // of the instruction.
        // 2. Scalable VF, as that would lead to invalid scalarization costs.
        // 3. Emulated masked memrefs, if a hacked cost is needed.
        if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
          for (const auto &[I, IC] : ScalarCosts)
            ScalarCostsVF.insert({I, IC});
          // Check if we decided to scalarize a call. If so, update the widening
          // decision of the call to CM_Scalarize with the computed scalar cost.
          for (const auto &[I, Cost] : ScalarCosts) {
            auto *CI = dyn_cast<CallInst>(I);
            if (!CI || !CallWideningDecisions.contains({CI, VF}))
              continue;
            CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
            CallWideningDecisions[{CI, VF}].Cost = Cost;
          }
        }
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization[VF].insert(BB);
        // A predecessor whose only successor is BB is the guarding block and
        // also remains.
        for (auto *Pred : predecessors(BB)) {
          if (Pred->getSingleSuccessor() == BB)
            PredicatedBBsAfterVectorization[VF].insert(Pred);
        }
      }
  }
}
4987
/// Compute the expected cost discount from scalarizing the predicated
/// instruction \p PredInst together with the single-use chain of instructions
/// feeding it, versus keeping them vectorized. A non-negative result means the
/// scalar form is at least as cheap; the per-instruction scalar costs are
/// recorded in \p ScalarCosts.
/// NOTE(review): the Worklist declaration line and part of one
/// getScalarizationOverhead call are elided in this view — confirm upstream.
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.

  // Returns true if the given instruction can be scalarized.
  auto CanBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(I))
      continue;

    // Cannot scalarize fixed-order recurrence phis at the moment.
    if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF);

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      Type *WideTy = toVectorizedTy(I->getType(), VF);
      for (Type *VectorTy : getContainedTypes(WideTy)) {
        ScalarCost += TTI.getScalarizationOverhead(
            /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(canVectorizeTy(J->getType()) &&
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          Type *WideTy = toVectorizedTy(J->getType(), VF);
          for (Type *VectorTy : getContainedTypes(WideTy)) {
            ScalarCost += TTI.getScalarizationOverhead(
                cast<VectorType>(VectorTy),
                APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
                /*Extract*/ true, CostKind);
          }
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5109

  // Sum the per-instruction cost-model estimates over every block of the loop
  // for the given VF, scaling predicated blocks by their execution
  // probability in the scalar case.
  // NOTE(review): the function signature (expectedCost), the 'Cost'
  // accumulator declaration, and several interior lines (including the
  // per-instruction cost query) are elided in this view — confirm upstream.
  // If the vector loop gets executed exactly once with the given VF, ignore the
  // costs of comparison and induction instructions, as they'll get simplified
  // away.
  SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
  auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
  if (TC == VF && !foldTailByMasking())
      ValuesToIgnoreForVF);

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    InstructionCost BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Skip ignored values.
      if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
          (VF.isVector() && VecValuesToIgnore.count(&I)))
        continue;


      // Check if we should override the cost.
      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
        // For interleave groups, use ForceTargetInstructionCost once for the
        // whole group.
        if (VF.isVector() && getWideningDecision(&I, VF) == CM_Interleave) {
          if (getInterleavedAccessGroup(&I)->getInsertPos() == &I)
          else
            C = InstructionCost(0);
        } else {
        }
      }

      BlockCost += C;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
                        << VF << " For instruction: " << I << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it.
    // getPredBlockCostDivisor will return 1 for blocks that are only predicated
    // by the header mask when folding the tail.
    if (VF.isScalar())
      BlockCost /= getPredBlockCostDivisor(CostKind, BB);

    Cost += BlockCost;
  }

  return Cost;
}
5170
5171/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
5172/// according to isAddressSCEVForCost.
5173///
5174/// This SCEV can be sent to the Target in order to estimate the address
5175/// calculation cost.
// NOTE(review): original line 5176 (the return type / function name line,
// presumably "static const SCEV *getAddressAccessSCEV(") is elided in this
// extraction.
5177 Value *Ptr,
// NOTE(review): original line 5178 (the PredicatedScalarEvolution parameter)
// is elided in this extraction.
5179 const Loop *TheLoop) {
// Return Ptr's SCEV when usable for address-cost modeling, else nullptr so
// callers fall back to a generic address-computation cost.
5180 const SCEV *Addr = PSE.getSCEV(Ptr);
5181 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), TheLoop) ? Addr
5182 : nullptr;
5183}
5184
// Cost of executing a load/store as VF scalar operations (address
// computation, VF scalar memory ops, scalarization overhead, and -- for
// predicated accesses -- i1 extracts plus a conditional branch).
// NOTE(review): the return type (original line 5185) and several statement
// lines (5197, 5207, 5220, 5231) are elided in this extraction.
5186LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5187 ElementCount VF) {
5188 assert(VF.isVector() &&
5189 "Scalarization cost of instruction implies vectorization.");
// Scalarizing a scalable vector would need a runtime loop; not supported.
5190 if (VF.isScalable())
5191 return InstructionCost::getInvalid();
5192
5193 Type *ValTy = getLoadStoreType(I);
5194 auto *SE = PSE.getSE();
5195
5196 unsigned AS = getLoadStoreAddressSpace(I);
5198 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5199 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5200 // that it is being called from this specific place.
5201
5202 // Figure out whether the access is strided and get the stride value
5203 // if it's known in compile time
5204 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);
5205
5206 // Get the cost of the scalar memory instruction and address computation.
5208 PtrTy, SE, PtrSCEV, CostKind);
5209
5210 // Don't pass *I here, since it is scalar but will actually be part of a
5211 // vectorized loop where the user of it is a vectorized instruction.
5212 const Align Alignment = getLoadStoreAlignment(I);
5213 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
// One scalar memory op per lane, hence the multiply by the fixed VF.
5214 Cost += VF.getFixedValue() *
5215 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5216 AS, CostKind, OpInfo);
5217
5218 // Get the overhead of the extractelement and insertelement instructions
5219 // we might create due to scalarization.
5221
5222 // If we have a predicated load/store, it will need extra i1 extracts and
5223 // conditional branches, but may not be executed for each vector lane. Scale
5224 // the cost by the probability of executing the predicated block.
5225 if (isPredicatedInst(I)) {
5226 Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
5227
5228 // Add the cost of an i1 extract and a branch
5229 auto *VecI1Ty =
5230 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5232 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5233 /*Insert=*/false, /*Extract=*/true, CostKind);
5234 Cost += TTI.getCFInstrCost(Instruction::CondBr, CostKind);
5235
5236 if (useEmulatedMaskMemRefHack(I, VF))
5237 // Artificially setting to a high enough value to practically disable
5238 // vectorization with such operations.
5239 Cost = 3000000;
5240 }
5241
5242 return Cost;
5243}
5244
// Cost of a consecutive (stride +/-1) wide load/store: masked-intrinsic or
// plain wide memory op, plus a reverse shuffle for negative stride.
// NOTE(review): the return type (original line 5245) and lines 5250, 5257,
// 5262, 5272 are elided in this extraction.
5246LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5247 ElementCount VF) {
5248 Type *ValTy = getLoadStoreType(I);
5249 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5251 unsigned AS = getLoadStoreAddressSpace(I);
5252 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5253
5254 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5255 "Stride should be 1 or -1 for consecutive memory access");
5256 const Align Alignment = getLoadStoreAlignment(I);
// Masked accesses are costed as the corresponding masked load/store
// intrinsic; unmasked ones as a plain wide memory op.
5258 if (isMaskRequired(I)) {
5259 unsigned IID = I->getOpcode() == Instruction::Load
5260 ? Intrinsic::masked_load
5261 : Intrinsic::masked_store;
5263 MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS), CostKind);
5264 } else {
5265 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5266 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5267 CostKind, OpInfo, I);
5268 }
5269
// Negative stride needs an extra shuffle to reverse the vector.
5270 bool Reverse = ConsecutiveStride < 0;
5271 if (Reverse)
5273 VectorTy, {}, CostKind, 0);
5274 return Cost;
5275}
5276
// Cost of a uniform memory op: scalar load + broadcast, or scalar store
// (+ last-element extract when the stored value is not loop-invariant).
// NOTE(review): the return type (original line 5277) and lines 5283, 5291,
// 5301 are elided in this extraction.
5278LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5279 ElementCount VF) {
5280 assert(Legal->isUniformMemOp(*I, VF));
5281
5282 Type *ValTy = getLoadStoreType(I);
5284 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5285 const Align Alignment = getLoadStoreAlignment(I);
5286 unsigned AS = getLoadStoreAddressSpace(I);
5287 if (isa<LoadInst>(I)) {
// Uniform load: one scalar load, then broadcast to all lanes.
5288 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5289 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5290 CostKind) +
5292 VectorTy, {}, CostKind);
5293 }
5294 StoreInst *SI = cast<StoreInst>(I);
5295
5296 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5297 // TODO: We have existing tests that request the cost of extracting element
5298 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5299 // the actual generated code, which involves extracting the last element of
5300 // a scalable vector where the lane to extract is unknown at compile time.
5302 TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5303 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
5304 if (!IsLoopInvariantStoreValue)
5305 Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
5306 VectorTy, CostKind, 0);
5307 return Cost;
5308}
5309
// Cost of executing a load/store as a masked gather/scatter intrinsic.
// NOTE(review): the return type (original line 5310) and lines 5316, 5326
// are elided in this extraction.
5311LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5312 ElementCount VF) {
5313 Type *ValTy = getLoadStoreType(I);
5314 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5315 const Align Alignment = getLoadStoreAlignment(I);
5317 Type *PtrTy = Ptr->getType();
5318
// A non-uniform pointer means a vector of addresses is needed.
5319 if (!Legal->isUniform(Ptr, VF))
5320 PtrTy = toVectorTy(PtrTy, VF);
5321
5322 unsigned IID = I->getOpcode() == Instruction::Load
5323 ? Intrinsic::masked_gather
5324 : Intrinsic::masked_scatter;
5325 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5327 MemIntrinsicCostAttributes(IID, VectorTy, Ptr, isMaskRequired(I),
5328 Alignment, I),
5329 CostKind);
5330}
5331
// Cost of a whole interleave group, attributed to its insert position; adds
// reverse shuffles per member for reversed groups.
// NOTE(review): the return type (original line 5332) and lines 5356, 5365
// are elided in this extraction.
5333LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5334 ElementCount VF) {
5335 const auto *Group = getInterleavedAccessGroup(I);
5336 assert(Group && "Fail to get an interleaved access group.");
5337
5338 Instruction *InsertPos = Group->getInsertPos();
5339 Type *ValTy = getLoadStoreType(InsertPos);
5340 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5341 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5342
5343 unsigned InterleaveFactor = Group->getFactor();
// The group reads/writes VF * Factor elements in one wide access.
5344 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5345
5346 // Holds the indices of existing members in the interleaved group.
5347 SmallVector<unsigned, 4> Indices;
5348 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5349 if (Group->getMember(IF))
5350 Indices.push_back(IF);
5351
5352 // Calculate the cost of the whole interleaved group.
5353 bool UseMaskForGaps =
5354 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5355 (isa<StoreInst>(I) && !Group->isFull());
5357 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5358 Group->getAlign(), AS, CostKind, isMaskRequired(I), UseMaskForGaps);
5359
5360 if (Group->isReverse()) {
5361 // TODO: Add support for reversed masked interleaved access.
5362 assert(!isMaskRequired(I) &&
5363 "Reverse masked interleaved access not supported.");
// One reverse shuffle per live member of the group.
5364 Cost += Group->getNumMembers() *
5366 VectorTy, {}, CostKind, 0);
5367 }
5368 return Cost;
5369}
5370
// Matches I against in-loop reduction patterns (reduce(mul(ext,ext)),
// reduce(mul), reduce(ext), reduce(A)) and returns the cheaper fused
// reduction cost when TTI reports one; std::nullopt means "use the normal
// cost model". Instructions folded into the pattern get cost 0.
// NOTE(review): the function-name line (original line 5372, presumably
// "LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,")
// and lines 5422-5423, 5446-5447, 5453-5454, 5471, 5476, 5496, 5517, 5520,
// 5533 are elided in this extraction.
5371std::optional<InstructionCost>
5373 ElementCount VF,
5374 Type *Ty) const {
5375 using namespace llvm::PatternMatch;
5376 // Early exit for no inloop reductions
5377 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5378 return std::nullopt;
5379 auto *VectorTy = cast<VectorType>(Ty);
5380
5381 // We are looking for a pattern of, and finding the minimal acceptable cost:
5382 // reduce(mul(ext(A), ext(B))) or
5383 // reduce(mul(A, B)) or
5384 // reduce(ext(A)) or
5385 // reduce(A).
5386 // The basic idea is that we walk down the tree to do that, finding the root
5387 // reduction instruction in InLoopReductionImmediateChains. From there we find
5388 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5389 // of the components. If the reduction cost is lower then we return it for the
5390 // reduction instruction and 0 for the other instructions in the pattern. If
5391 // it is not we return an invalid cost specifying the orignal cost method
5392 // should be used.
5393 Instruction *RetI = I;
5394 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5395 if (!RetI->hasOneUser())
5396 return std::nullopt;
5397 RetI = RetI->user_back();
5398 }
5399
5400 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5401 RetI->user_back()->getOpcode() == Instruction::Add) {
5402 RetI = RetI->user_back();
5403 }
5404
5405 // Test if the found instruction is a reduction, and if not return an invalid
5406 // cost specifying the parent to use the original cost modelling.
5407 Instruction *LastChain = InLoopReductionImmediateChains.lookup(RetI);
5408 if (!LastChain)
5409 return std::nullopt;
5410
5411 // Find the reduction this chain is a part of and calculate the basic cost of
5412 // the reduction on its own.
5413 Instruction *ReductionPhi = LastChain;
5414 while (!isa<PHINode>(ReductionPhi))
5415 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5416
5417 const RecurrenceDescriptor &RdxDesc =
5418 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
5419
5420 InstructionCost BaseCost;
5421 RecurKind RK = RdxDesc.getRecurrenceKind();
// NOTE(review): the min/max-recurrence check (original lines 5422-5423) is
// elided; this branch handles min/max reductions.
5424 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5425 RdxDesc.getFastMathFlags(), CostKind);
5426 } else {
5427 BaseCost = TTI.getArithmeticReductionCost(
5428 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5429 }
5430
5431 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5432 // normal fmul instruction to the cost of the fadd reduction.
5433 if (RK == RecurKind::FMulAdd)
5434 BaseCost +=
5435 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5436
5437 // If we're using ordered reductions then we can just return the base cost
5438 // here, since getArithmeticReductionCost calculates the full ordered
5439 // reduction cost when FP reassociation is not allowed.
5440 if (useOrderedReductions(RdxDesc))
5441 return BaseCost;
5442
5443 // Get the operand that was not the reduction chain and match it to one of the
5444 // patterns, returning the better cost if it is found.
5445 Instruction *RedOp = RetI->getOperand(1) == LastChain
// NOTE(review): original lines 5446-5447 (the operand-selection ternary) are
// elided in this extraction.
5448
5449 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5450
5451 Instruction *Op0, *Op1;
5452 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5453 match(RedOp,
5455 match(Op0, m_ZExtOrSExt(m_Value())) &&
5456 Op0->getOpcode() == Op1->getOpcode() &&
5457 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5458 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5459 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5460
5461 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5462 // Note that the extend opcodes need to all match, or if A==B they will have
5463 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5464 // which is equally fine.
5465 bool IsUnsigned = isa<ZExtInst>(Op0);
5466 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5467 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5468
5469 InstructionCost ExtCost =
5470 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5472 InstructionCost MulCost =
5473 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5474 InstructionCost Ext2Cost =
5475 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5477
5478 InstructionCost RedCost = TTI.getMulAccReductionCost(
5479 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5480 CostKind);
5481
5482 if (RedCost.isValid() &&
5483 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5484 return I == RetI ? RedCost : 0;
5485 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5486 !TheLoop->isLoopInvariant(RedOp)) {
5487 // Matched reduce(ext(A))
5488 bool IsUnsigned = isa<ZExtInst>(RedOp);
5489 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5490 InstructionCost RedCost = TTI.getExtendedReductionCost(
5491 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5492 RdxDesc.getFastMathFlags(), CostKind);
5493
5494 InstructionCost ExtCost =
5495 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5497 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5498 return I == RetI ? RedCost : 0;
5499 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5500 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5501 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5502 Op0->getOpcode() == Op1->getOpcode() &&
5503 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5504 bool IsUnsigned = isa<ZExtInst>(Op0);
5505 Type *Op0Ty = Op0->getOperand(0)->getType();
5506 Type *Op1Ty = Op1->getOperand(0)->getType();
5507 Type *LargestOpTy =
5508 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5509 : Op0Ty;
5510 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5511
5512 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5513 // different sizes. We take the largest type as the ext to reduce, and add
5514 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5515 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5516 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5518 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5519 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5521 InstructionCost MulCost =
5522 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5523
5524 InstructionCost RedCost = TTI.getMulAccReductionCost(
5525 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5526 CostKind);
5527 InstructionCost ExtraExtCost = 0;
5528 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5529 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5530 ExtraExtCost = TTI.getCastInstrCost(
5531 ExtraExtOp->getOpcode(), ExtType,
5532 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5534 }
5535
5536 if (RedCost.isValid() &&
5537 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5538 return I == RetI ? RedCost : 0;
5539 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5540 // Matched reduce.add(mul())
5541 InstructionCost MulCost =
5542 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5543
5544 InstructionCost RedCost = TTI.getMulAccReductionCost(
5545 true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
5546 CostKind);
5547
5548 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5549 return I == RetI ? RedCost : 0;
5550 }
5551 }
5552
// No cheaper fused pattern: the reduction root keeps BaseCost, other
// instructions fall back to normal costing.
5553 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5554}
5555
// Cost of a memory instruction: computed directly for scalar VF, otherwise
// the previously-recorded widening-decision cost is returned.
// NOTE(review): the return type (original line 5556) and line 5563 are
// elided in this extraction.
5557LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5558 ElementCount VF) {
5559 // Calculate scalar cost only. Vectorization cost should be ready at this
5560 // moment.
5561 if (VF.isScalar()) {
5562 Type *ValTy = getLoadStoreType(I);
5564 const Align Alignment = getLoadStoreAlignment(I);
5565 unsigned AS = getLoadStoreAddressSpace(I);
5566
5567 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5568 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5569 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5570 OpInfo, I);
5571 }
5572 return getWideningCost(I, VF);
5573}
5574
// Insert/extract overhead incurred when scalarizing I at the given VF:
// result insertion plus operand extraction, with target-specific opt-outs.
// NOTE(review): the return type (original line 5575) and lines 5587, 5590,
// 5592, 5594, 5596, 5599-5600, 5607, 5611, 5620, 5624-5626 are elided in
// this extraction.
5576LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5577 ElementCount VF) const {
5578
5579 // There is no mechanism yet to create a scalable scalarization loop,
5580 // so this is currently Invalid.
5581 if (VF.isScalable())
5582 return InstructionCost::getInvalid();
5583
// Scalar VF has no insert/extract overhead by definition.
5584 if (VF.isScalar())
5585 return 0;
5586
5588 Type *RetTy = toVectorizedTy(I->getType(), VF);
5589 if (!RetTy->isVoidTy() &&
5591
5593 if (isa<LoadInst>(I))
5595 else if (isa<StoreInst>(I))
5597
5598 for (Type *VectorTy : getContainedTypes(RetTy)) {
5601 /*Insert=*/true, /*Extract=*/false, CostKind,
5602 /*ForPoisonSrc=*/true, {}, VIC);
5603 }
5604 }
5605
5606 // Some targets keep addresses scalar.
5608 return Cost;
5609
5610 // Some targets support efficient element stores.
5612 return Cost;
5613
5614 // Collect operands to consider.
5615 CallInst *CI = dyn_cast<CallInst>(I);
5616 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5617
5618 // Skip operands that do not require extraction/scalarization and do not incur
5619 // any overhead.
5621 for (auto *V : filterExtractingOperands(Ops, VF))
5622 Tys.push_back(maybeVectorizeType(V->getType(), VF));
5623
5627 return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind, OperandVIC);
5628}
5629
// NOTE(review): the function signature (original lines 5629-5630) is elided
// in this extraction; from the body this appears to be
// LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF).
// For each memory instruction it picks the cheapest of widen /
// gather-scatter / interleave / scalarize and records the decision; it then
// forces address computations scalar when the target does not prefer
// vectorized addressing. Several statement lines are elided (see inline
// notes); confirm against the full source.
5631 if (VF.isScalar())
5632 return;
5633 NumPredStores = 0;
5634 for (BasicBlock *BB : TheLoop->blocks()) {
5635 // For each instruction in the old loop.
5636 for (Instruction &I : *BB) {
// NOTE(review): original line 5637 (the pointer-operand query) is elided.
5638 if (!Ptr)
5639 continue;
5640
5641 // TODO: We should generate better code and update the cost model for
5642 // predicated uniform stores. Today they are treated as any other
5643 // predicated store (see added test cases in
5644 // invariant-store-vectorization.ll).
// NOTE(review): original line 5645 (the predicated-store condition) is
// elided.
5646 NumPredStores++;
5647
5648 if (Legal->isUniformMemOp(I, VF)) {
5649 auto IsLegalToScalarize = [&]() {
5650 if (!VF.isScalable())
5651 // Scalarization of fixed length vectors "just works".
5652 return true;
5653
5654 // We have dedicated lowering for unpredicated uniform loads and
5655 // stores. Note that even with tail folding we know that at least
5656 // one lane is active (i.e. generalized predication is not possible
5657 // here), and the logic below depends on this fact.
5658 if (!foldTailByMasking())
5659 return true;
5660
5661 // For scalable vectors, a uniform memop load is always
5662 // uniform-by-parts and we know how to scalarize that.
5663 if (isa<LoadInst>(I))
5664 return true;
5665
5666 // A uniform store isn't neccessarily uniform-by-part
5667 // and we can't assume scalarization.
5668 auto &SI = cast<StoreInst>(I);
5669 return TheLoop->isLoopInvariant(SI.getValueOperand());
5670 };
5671
5672 const InstructionCost GatherScatterCost =
// NOTE(review): original line 5673 (the legality condition for the
// gather/scatter alternative) is elided.
5674 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
5675
5676 // Load: Scalar load + broadcast
5677 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5678 // FIXME: This cost is a significant under-estimate for tail folded
5679 // memory ops.
5680 const InstructionCost ScalarizationCost =
5681 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
// NOTE(review): original line 5682 (the invalid-cost fallback) is elided.
5683
5684 // Choose better solution for the current VF, Note that Invalid
5685 // costs compare as maximumal large. If both are invalid, we get
5686 // scalable invalid which signals a failure and a vectorization abort.
5687 if (GatherScatterCost < ScalarizationCost)
5688 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
5689 else
5690 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
5691 continue;
5692 }
5693
5694 // We assume that widening is the best solution when possible.
5695 if (memoryInstructionCanBeWidened(&I, VF)) {
5696 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
5697 int ConsecutiveStride = Legal->isConsecutivePtr(
// NOTE(review): original line 5698 (the isConsecutivePtr arguments) is
// elided.
5699 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5700 "Expected consecutive stride.");
5701 InstWidening Decision =
5702 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5703 setWideningDecision(&I, VF, Decision, Cost);
5704 continue;
5705 }
5706
5707 // Choose between Interleaving, Gather/Scatter or Scalarization.
// NOTE(review): original line 5708 (the InterleaveCost declaration) is
// elided.
5709 unsigned NumAccesses = 1;
5710 if (isAccessInterleaved(&I)) {
5711 const auto *Group = getInterleavedAccessGroup(&I);
5712 assert(Group && "Fail to get an interleaved access group.");
5713
5714 // Make one decision for the whole group.
5715 if (getWideningDecision(&I, VF) != CM_Unknown)
5716 continue;
5717
5718 NumAccesses = Group->getNumMembers();
// NOTE(review): original line 5719 (the interleave-legality condition) is
// elided.
5720 InterleaveCost = getInterleaveGroupCost(&I, VF);
5721 }
5722
5723 InstructionCost GatherScatterCost =
// NOTE(review): original line 5724 (the legality condition) and line 5726
// (the invalid-cost fallback) are elided.
5725 ? getGatherScatterCost(&I, VF) * NumAccesses
5727
5728 InstructionCost ScalarizationCost =
5729 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5730
5731 // Choose better solution for the current VF,
5732 // write down this decision and use it during vectorization.
// NOTE(review): original line 5733 (the Cost declaration) is elided.
5734 InstWidening Decision;
5735 if (InterleaveCost <= GatherScatterCost &&
5736 InterleaveCost < ScalarizationCost) {
5737 Decision = CM_Interleave;
5738 Cost = InterleaveCost;
5739 } else if (GatherScatterCost < ScalarizationCost) {
5740 Decision = CM_GatherScatter;
5741 Cost = GatherScatterCost;
5742 } else {
5743 Decision = CM_Scalarize;
5744 Cost = ScalarizationCost;
5745 }
5746 // If the instructions belongs to an interleave group, the whole group
5747 // receives the same decision. The whole group receives the cost, but
5748 // the cost will actually be assigned to one instruction.
5749 if (const auto *Group = getInterleavedAccessGroup(&I)) {
5750 if (Decision == CM_Scalarize) {
5751 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5752 if (auto *I = Group->getMember(Idx)) {
5753 setWideningDecision(I, VF, Decision,
5754 getMemInstScalarizationCost(I, VF));
5755 }
5756 }
5757 } else {
5758 setWideningDecision(Group, VF, Decision, Cost);
5759 }
5760 } else
5761 setWideningDecision(&I, VF, Decision, Cost);
5762 }
5763 }
5764
5765 // Make sure that any load of address and any other address computation
5766 // remains scalar unless there is gather/scatter support. This avoids
5767 // inevitable extracts into address registers, and also has the benefit of
5768 // activating LSR more, since that pass can't optimize vectorized
5769 // addresses.
5770 if (TTI.prefersVectorizedAddressing())
5771 return;
5772
5773 // Start with all scalar pointer uses.
// NOTE(review): original line 5774 (the AddrDefs set declaration) is elided.
5775 for (BasicBlock *BB : TheLoop->blocks())
5776 for (Instruction &I : *BB) {
5777 Instruction *PtrDef =
// NOTE(review): original lines 5778 and 5780 (the pointer-def expression
// and part of the condition) are elided.
5779 if (PtrDef && TheLoop->contains(PtrDef) &&
5781 AddrDefs.insert(PtrDef);
5782 }
5783
5784 // Add all instructions used to generate the addresses.
// NOTE(review): original line 5785 (the Worklist declaration) is elided.
5786 append_range(Worklist, AddrDefs);
5787 while (!Worklist.empty()) {
5788 Instruction *I = Worklist.pop_back_val();
5789 for (auto &Op : I->operands())
5790 if (auto *InstOp = dyn_cast<Instruction>(Op))
5791 if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
5792 AddrDefs.insert(InstOp).second)
5793 Worklist.push_back(InstOp);
5794 }
5795
5796 auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
5797 // If there are direct memory op users of the newly scalarized load,
5798 // their cost may have changed because there's no scalarization
5799 // overhead for the operand. Update it.
5800 for (User *U : LI->users()) {
// NOTE(review): original lines 5801, 5803, 5805-5806 (the user-filtering
// conditions and the setWideningDecision call head) are elided.
5802 continue;
5804 continue;
5807 getMemInstScalarizationCost(cast<Instruction>(U), VF));
5808 }
5809 };
5810 for (auto *I : AddrDefs) {
5811 if (isa<LoadInst>(I)) {
5812 // Setting the desired widening decision should ideally be handled in
5813 // by cost functions, but since this involves the task of finding out
5814 // if the loaded register is involved in an address computation, it is
5815 // instead changed here when we know this is the case.
5816 InstWidening Decision = getWideningDecision(I, VF);
5817 if (!isPredicatedInst(I) &&
5818 (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
5819 (!Legal->isUniformMemOp(*I, VF) && Decision == CM_Scalarize))) {
5820 // Scalarize a widened load of address or update the cost of a scalar
5821 // load of an address.
// NOTE(review): original line 5822 (the setWideningDecision call head) is
// elided.
5823 I, VF, CM_Scalarize,
5824 (VF.getKnownMinValue() *
5825 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5826 UpdateMemOpUserCost(cast<LoadInst>(I));
5827 } else if (const auto *Group = getInterleavedAccessGroup(I)) {
5828 // Scalarize all members of this interleaved group when any member
5829 // is used as an address. The address-used load skips scalarization
5830 // overhead, other members include it.
5831 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5832 if (Instruction *Member = Group->getMember(Idx)) {
// NOTE(review): original lines 5833, 5837, 5839 (the member cost
// computation and setWideningDecision call) are elided.
5834 AddrDefs.contains(Member)
5835 ? (VF.getKnownMinValue() *
5836 getMemoryInstructionCost(Member,
5838 : getMemInstScalarizationCost(Member, VF);
5840 UpdateMemOpUserCost(cast<LoadInst>(Member));
5841 }
5842 }
5843 }
5844 } else {
5845 // Cannot scalarize fixed-order recurrence phis at the moment.
5846 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5847 continue;
5848
5849 // Make sure I gets scalarized and a cost estimate without
5850 // scalarization overhead.
5851 ForcedScalars[VF].insert(I);
5852 }
5853 }
5854}
5855
// NOTE(review): the function signature (original line 5856) is elided in
// this extraction; from the body this appears to be
// LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF).
// For each call in the loop it compares scalarization, a vector-function
// variant, and an intrinsic lowering, and records the cheapest via
// setCallWideningDecision. Several statement lines are elided (see inline
// notes); confirm against the full source.
5857 assert(!VF.isScalar() &&
5858 "Trying to set a vectorization decision for a scalar VF");
5859
5860 auto ForcedScalar = ForcedScalars.find(VF);
5861 for (BasicBlock *BB : TheLoop->blocks()) {
5862 // For each instruction in the old loop.
5863 for (Instruction &I : *BB) {
// NOTE(review): original line 5864 (the CallInst dyn_cast) is elided.
5865
5866 if (!CI)
5867 continue;
5868
// NOTE(review): original lines 5869-5871 (the cost-variable declarations
// for ScalarCost/VectorCost/IntrinsicCost) are elided.
5872 Function *ScalarFunc = CI->getCalledFunction();
5873 Type *ScalarRetTy = CI->getType();
5874 SmallVector<Type *, 4> Tys, ScalarTys;
5875 for (auto &ArgOp : CI->args())
5876 ScalarTys.push_back(ArgOp->getType());
5877
5878 // Estimate cost of scalarized vector call. The source operands are
5879 // assumed to be vectors, so we need to extract individual elements from
5880 // there, execute VF scalar calls, and then gather the result into the
5881 // vector return value.
5882 if (VF.isFixed()) {
5883 InstructionCost ScalarCallCost =
5884 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
5885
5886 // Compute costs of unpacking argument values for the scalar calls and
5887 // packing the return values to a vector.
5888 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
5889 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5890 } else {
5891 // There is no point attempting to calculate the scalar cost for a
5892 // scalable VF as we know it will be Invalid.
// NOTE(review): original line 5893 (the assert head) is elided.
5894 "Unexpected valid cost for scalarizing scalable vectors");
5895 ScalarCost = InstructionCost::getInvalid();
5896 }
5897
5898 // Honor ForcedScalars and UniformAfterVectorization decisions.
5899 // TODO: For calls, it might still be more profitable to widen. Use
5900 // VPlan-based cost model to compare different options.
5901 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5902 ForcedScalar->second.contains(CI)) ||
5903 isUniformAfterVectorization(CI, VF))) {
5904 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
5905 Intrinsic::not_intrinsic, std::nullopt,
5906 ScalarCost);
5907 continue;
5908 }
5909
5910 bool MaskRequired = isMaskRequired(CI);
5911 // Compute corresponding vector type for return value and arguments.
5912 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
5913 for (Type *ScalarTy : ScalarTys)
5914 Tys.push_back(toVectorizedTy(ScalarTy, VF));
5915
5916 // An in-loop reduction using an fmuladd intrinsic is a special case;
5917 // we don't want the normal cost for that intrinsic.
// NOTE(review): original line 5918 (the fmuladd match condition) is elided.
5919 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
// NOTE(review): original lines 5920-5921 (the setCallWideningDecision call
// head) are elided.
5922 std::nullopt, *RedCost);
5923 continue;
5924 }
5925
5926 // Find the cost of vectorizing the call, if we can find a suitable
5927 // vector variant of the function.
5928 VFInfo FuncInfo;
5929 Function *VecFunc = nullptr;
5930 // Search through any available variants for one we can use at this VF.
5931 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
5932 // Must match requested VF.
5933 if (Info.Shape.VF != VF)
5934 continue;
5935
5936 // Must take a mask argument if one is required
5937 if (MaskRequired && !Info.isMasked())
5938 continue;
5939
5940 // Check that all parameter kinds are supported
5941 bool ParamsOk = true;
5942 for (VFParameter Param : Info.Shape.Parameters) {
5943 switch (Param.ParamKind) {
// NOTE(review): original lines 5944, 5946, 5954, 5968 (the case labels,
// e.g. Vector / OMP_Uniform / OMP_Linear) are elided.
5945 break;
5947 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5948 // Make sure the scalar parameter in the loop is invariant.
5949 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
5950 TheLoop))
5951 ParamsOk = false;
5952 break;
5953 }
5955 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5956 // Find the stride for the scalar parameter in this loop and see if
5957 // it matches the stride for the variant.
5958 // TODO: do we need to figure out the cost of an extract to get the
5959 // first lane? Or do we hope that it will be folded away?
5960 ScalarEvolution *SE = PSE.getSE();
5961 if (!match(SE->getSCEV(ScalarParam),
// NOTE(review): original lines 5962 and 5964 (parts of the SCEV AddRec
// match) are elided.
5963 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
5965 ParamsOk = false;
5966 break;
5967 }
5969 break;
5970 default:
5971 ParamsOk = false;
5972 break;
5973 }
5974 }
5975
5976 if (!ParamsOk)
5977 continue;
5978
5979 // Found a suitable candidate, stop here.
5980 VecFunc = CI->getModule()->getFunction(Info.VectorName);
5981 FuncInfo = Info;
5982 break;
5983 }
5984
5985 if (TLI && VecFunc && !CI->isNoBuiltin())
5986 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
5987
5988 // Find the cost of an intrinsic; some targets may have instructions that
5989 // perform the operation without needing an actual call.
// NOTE(review): original lines 5990 and 5992 (the intrinsic-ID query and
// the IntrinsicCost computation) are elided.
5991 if (IID != Intrinsic::not_intrinsic)
5993
5994 InstructionCost Cost = ScalarCost;
5995 InstWidening Decision = CM_Scalarize;
5996
5997 if (VectorCost.isValid() && VectorCost <= Cost) {
5998 Cost = VectorCost;
5999 Decision = CM_VectorCall;
6000 }
6001
6002 if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
// NOTE(review): original line 6003 (the Cost = IntrinsicCost assignment) is
// elided.
6004 Decision = CM_IntrinsicCall;
6005 }
6006
6007 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
// NOTE(review): original line 6008 (the remaining call arguments) is
// elided.
6009 }
6010 }
6011}
6012
// NOTE(review): the function signature (original line 6013) is elided in
// this extraction; the recursive call below names it
// LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op).
// Returns true when Op is invariant AND trivially hoistable: Op is not a
// predicated in-loop instruction, not a header phi, and all of its operands
// recursively satisfy the same property.
6014 if (!Legal->isInvariant(Op))
6015 return false;
6016 // Consider Op invariant, if it or its operands aren't predicated
6017 // instruction in the loop. In that case, it is not trivially hoistable.
6018 auto *OpI = dyn_cast<Instruction>(Op);
6019 return !OpI || !TheLoop->contains(OpI) ||
6020 (!isPredicatedInst(OpI) &&
6021 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6022 all_of(OpI->operands(),
6023 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6024}
6025
6028 ElementCount VF) {
6029 // If we know that this instruction will remain uniform, check the cost of
6030 // the scalar version.
6032 VF = ElementCount::getFixed(1);
6033
6034 if (VF.isVector() && isProfitableToScalarize(I, VF))
6035 return InstsToScalarize[VF][I];
6036
6037 // Forced scalars do not have any scalarization overhead.
6038 auto ForcedScalar = ForcedScalars.find(VF);
6039 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6040 auto InstSet = ForcedScalar->second;
6041 if (InstSet.count(I))
6043 VF.getKnownMinValue();
6044 }
6045
6046 Type *RetTy = I->getType();
6048 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6049 auto *SE = PSE.getSE();
6050
6051 Type *VectorTy;
6052 if (isScalarAfterVectorization(I, VF)) {
6053 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
6054 [this](Instruction *I, ElementCount VF) -> bool {
6055 if (VF.isScalar())
6056 return true;
6057
6058 auto Scalarized = InstsToScalarize.find(VF);
6059 assert(Scalarized != InstsToScalarize.end() &&
6060 "VF not yet analyzed for scalarization profitability");
6061 return !Scalarized->second.count(I) &&
6062 llvm::all_of(I->users(), [&](User *U) {
6063 auto *UI = cast<Instruction>(U);
6064 return !Scalarized->second.count(UI);
6065 });
6066 };
6067
6068 // With the exception of GEPs and PHIs, after scalarization there should
6069 // only be one copy of the instruction generated in the loop. This is
6070 // because the VF is either 1, or any instructions that need scalarizing
6071 // have already been dealt with by the time we get here. As a result,
6072 // it means we don't have to multiply the instruction cost by VF.
6073 assert(I->getOpcode() == Instruction::GetElementPtr ||
6074 I->getOpcode() == Instruction::PHI ||
6075 (I->getOpcode() == Instruction::BitCast &&
6076 I->getType()->isPointerTy()) ||
6077 HasSingleCopyAfterVectorization(I, VF));
6078 VectorTy = RetTy;
6079 } else
6080 VectorTy = toVectorizedTy(RetTy, VF);
6081
6082 if (VF.isVector() && VectorTy->isVectorTy() &&
6083 !TTI.getNumberOfParts(VectorTy))
6085
6086 // TODO: We need to estimate the cost of intrinsic calls.
6087 switch (I->getOpcode()) {
6088 case Instruction::GetElementPtr:
6089 // We mark this instruction as zero-cost because the cost of GEPs in
6090 // vectorized code depends on whether the corresponding memory instruction
6091 // is scalarized or not. Therefore, we handle GEPs with the memory
6092 // instruction cost.
6093 return 0;
6094 case Instruction::UncondBr:
6095 case Instruction::CondBr: {
6096 // In cases of scalarized and predicated instructions, there will be VF
6097 // predicated blocks in the vectorized loop. Each branch around these
6098 // blocks requires also an extract of its vector compare i1 element.
6099 // Note that the conditional branch from the loop latch will be replaced by
6100 // a single branch controlling the loop, so there is no extra overhead from
6101 // scalarization.
6102 bool ScalarPredicatedBB = false;
6104 if (VF.isVector() && BI &&
6105 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6106 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6107 BI->getParent() != TheLoop->getLoopLatch())
6108 ScalarPredicatedBB = true;
6109
6110 if (ScalarPredicatedBB) {
6111 // Not possible to scalarize scalable vector with predicated instructions.
6112 if (VF.isScalable())
6114 // Return cost for branches around scalarized and predicated blocks.
6115 auto *VecI1Ty =
6117 return (TTI.getScalarizationOverhead(
6118 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6119 /*Insert*/ false, /*Extract*/ true, CostKind) +
6120 (TTI.getCFInstrCost(Instruction::CondBr, CostKind) *
6121 VF.getFixedValue()));
6122 }
6123
6124 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6125 // The back-edge branch will remain, as will all scalar branches.
6126 return TTI.getCFInstrCost(Instruction::UncondBr, CostKind);
6127
6128 // This branch will be eliminated by if-conversion.
6129 return 0;
6130 // Note: We currently assume zero cost for an unconditional branch inside
6131 // a predicated block since it will become a fall-through, although we
6132 // may decide in the future to call TTI for all branches.
6133 }
6134 case Instruction::Switch: {
6135 if (VF.isScalar())
6136 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6137 auto *Switch = cast<SwitchInst>(I);
6138 return Switch->getNumCases() *
6139 TTI.getCmpSelInstrCost(
6140 Instruction::ICmp,
6141 toVectorTy(Switch->getCondition()->getType(), VF),
6142 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6144 }
6145 case Instruction::PHI: {
6146 auto *Phi = cast<PHINode>(I);
6147
6148 // First-order recurrences are replaced by vector shuffles inside the loop.
6149 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6151 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6152 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6153 cast<VectorType>(VectorTy),
6154 cast<VectorType>(VectorTy), Mask, CostKind,
6155 VF.getKnownMinValue() - 1);
6156 }
6157
6158 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6159 // converted into select instructions. We require N - 1 selects per phi
6160 // node, where N is the number of incoming values.
6161 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6162 Type *ResultTy = Phi->getType();
6163
6164 // All instructions in an Any-of reduction chain are narrowed to bool.
6165 // Check if that is the case for this phi node.
6166 auto *HeaderUser = cast_if_present<PHINode>(
6167 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6168 auto *Phi = dyn_cast<PHINode>(U);
6169 if (Phi && Phi->getParent() == TheLoop->getHeader())
6170 return Phi;
6171 return nullptr;
6172 }));
6173 if (HeaderUser) {
6174 auto &ReductionVars = Legal->getReductionVars();
6175 auto Iter = ReductionVars.find(HeaderUser);
6176 if (Iter != ReductionVars.end() &&
6178 Iter->second.getRecurrenceKind()))
6179 ResultTy = Type::getInt1Ty(Phi->getContext());
6180 }
6181 return (Phi->getNumIncomingValues() - 1) *
6182 TTI.getCmpSelInstrCost(
6183 Instruction::Select, toVectorTy(ResultTy, VF),
6184 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6186 }
6187
6188 // When tail folding with EVL, if the phi is part of an out of loop
6189 // reduction then it will be transformed into a wide vp_merge.
6190 if (VF.isVector() && foldTailWithEVL() &&
6191 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6193 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6194 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6195 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6196 }
6197
6198 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6199 }
6200 case Instruction::UDiv:
6201 case Instruction::SDiv:
6202 case Instruction::URem:
6203 case Instruction::SRem:
6204 if (VF.isVector() && isPredicatedInst(I)) {
6205 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6206 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6207 ScalarCost : SafeDivisorCost;
6208 }
6209 // We've proven all lanes safe to speculate, fall through.
6210 [[fallthrough]];
6211 case Instruction::Add:
6212 case Instruction::Sub: {
6213 auto Info = Legal->getHistogramInfo(I);
6214 if (Info && VF.isVector()) {
6215 const HistogramInfo *HGram = Info.value();
6216 // Assume that a non-constant update value (or a constant != 1) requires
6217 // a multiply, and add that into the cost.
6219 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6220 if (!RHS || RHS->getZExtValue() != 1)
6221 MulCost =
6222 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6223
6224 // Find the cost of the histogram operation itself.
6225 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6226 Type *ScalarTy = I->getType();
6227 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6228 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6229 Type::getVoidTy(I->getContext()),
6230 {PtrTy, ScalarTy, MaskTy});
6231
6232 // Add the costs together with the add/sub operation.
6233 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6234 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6235 }
6236 [[fallthrough]];
6237 }
6238 case Instruction::FAdd:
6239 case Instruction::FSub:
6240 case Instruction::Mul:
6241 case Instruction::FMul:
6242 case Instruction::FDiv:
6243 case Instruction::FRem:
6244 case Instruction::Shl:
6245 case Instruction::LShr:
6246 case Instruction::AShr:
6247 case Instruction::And:
6248 case Instruction::Or:
6249 case Instruction::Xor: {
6250 // If we're speculating on the stride being 1, the multiplication may
6251 // fold away. We can generalize this for all operations using the notion
6252 // of neutral elements. (TODO)
6253 if (I->getOpcode() == Instruction::Mul &&
6254 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
6255 PSE.getSCEV(I->getOperand(0))->isOne()) ||
6256 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
6257 PSE.getSCEV(I->getOperand(1))->isOne())))
6258 return 0;
6259
6260 // Detect reduction patterns
6261 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6262 return *RedCost;
6263
6264 // Certain instructions can be cheaper to vectorize if they have a constant
6265 // second vector operand. One example of this are shifts on x86.
6266 Value *Op2 = I->getOperand(1);
6267 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
6268 PSE.getSE()->isSCEVable(Op2->getType()) &&
6269 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6270 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6271 }
6272 auto Op2Info = TTI.getOperandInfo(Op2);
6273 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6276
6277 SmallVector<const Value *, 4> Operands(I->operand_values());
6278 return TTI.getArithmeticInstrCost(
6279 I->getOpcode(), VectorTy, CostKind,
6280 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6281 Op2Info, Operands, I, TLI);
6282 }
6283 case Instruction::FNeg: {
6284 return TTI.getArithmeticInstrCost(
6285 I->getOpcode(), VectorTy, CostKind,
6286 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6287 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6288 I->getOperand(0), I);
6289 }
6290 case Instruction::Select: {
6292 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6293 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6294
6295 const Value *Op0, *Op1;
6296 using namespace llvm::PatternMatch;
6297 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6298 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6299 // select x, y, false --> x & y
6300 // select x, true, y --> x | y
6301 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6302 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6303 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6304 Op1->getType()->getScalarSizeInBits() == 1);
6305
6306 return TTI.getArithmeticInstrCost(
6307 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
6308 VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
6309 }
6310
6311 Type *CondTy = SI->getCondition()->getType();
6312 if (!ScalarCond)
6313 CondTy = VectorType::get(CondTy, VF);
6314
6316 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6317 Pred = Cmp->getPredicate();
6318 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6319 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6320 {TTI::OK_AnyValue, TTI::OP_None}, I);
6321 }
6322 case Instruction::ICmp:
6323 case Instruction::FCmp: {
6324 Type *ValTy = I->getOperand(0)->getType();
6325
6327 [[maybe_unused]] Instruction *Op0AsInstruction =
6328 dyn_cast<Instruction>(I->getOperand(0));
6329 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6330 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6331 "if both the operand and the compare are marked for "
6332 "truncation, they must have the same bitwidth");
6333 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6334 }
6335
6336 VectorTy = toVectorTy(ValTy, VF);
6337 return TTI.getCmpSelInstrCost(
6338 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
6339 cast<CmpInst>(I)->getPredicate(), CostKind,
6340 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
6341 }
6342 case Instruction::Store:
6343 case Instruction::Load: {
6344 ElementCount Width = VF;
6345 if (Width.isVector()) {
6346 InstWidening Decision = getWideningDecision(I, Width);
6347 assert(Decision != CM_Unknown &&
6348 "CM decision should be taken at this point");
6351 if (Decision == CM_Scalarize)
6352 Width = ElementCount::getFixed(1);
6353 }
6354 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6355 return getMemoryInstructionCost(I, VF);
6356 }
6357 case Instruction::BitCast:
6358 if (I->getType()->isPointerTy())
6359 return 0;
6360 [[fallthrough]];
6361 case Instruction::ZExt:
6362 case Instruction::SExt:
6363 case Instruction::FPToUI:
6364 case Instruction::FPToSI:
6365 case Instruction::FPExt:
6366 case Instruction::PtrToInt:
6367 case Instruction::IntToPtr:
6368 case Instruction::SIToFP:
6369 case Instruction::UIToFP:
6370 case Instruction::Trunc:
6371 case Instruction::FPTrunc: {
6372 // Computes the CastContextHint from a Load/Store instruction.
6373 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6375 "Expected a load or a store!");
6376
6377 if (VF.isScalar() || !TheLoop->contains(I))
6379
6380 switch (getWideningDecision(I, VF)) {
6392 llvm_unreachable("Instr did not go through cost modelling?");
6395 llvm_unreachable_internal("Instr has invalid widening decision");
6396 }
6397
6398 llvm_unreachable("Unhandled case!");
6399 };
6400
6401 unsigned Opcode = I->getOpcode();
6403 // For Trunc, the context is the only user, which must be a StoreInst.
6404 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6405 if (I->hasOneUse())
6406 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6407 CCH = ComputeCCH(Store);
6408 }
6409 // For Z/Sext, the context is the operand, which must be a LoadInst.
6410 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6411 Opcode == Instruction::FPExt) {
6412 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6413 CCH = ComputeCCH(Load);
6414 }
6415
6416 // We optimize the truncation of induction variables having constant
6417 // integer steps. The cost of these truncations is the same as the scalar
6418 // operation.
6419 if (isOptimizableIVTruncate(I, VF)) {
6420 auto *Trunc = cast<TruncInst>(I);
6421 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6422 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6423 }
6424
6425 // Detect reduction patterns
6426 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6427 return *RedCost;
6428
6429 Type *SrcScalarTy = I->getOperand(0)->getType();
6430 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6431 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6432 SrcScalarTy =
6433 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6434 Type *SrcVecTy =
6435 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6436
6438 // If the result type is <= the source type, there will be no extend
6439 // after truncating the users to the minimal required bitwidth.
6440 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6441 (I->getOpcode() == Instruction::ZExt ||
6442 I->getOpcode() == Instruction::SExt))
6443 return 0;
6444 }
6445
6446 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6447 }
6448 case Instruction::Call:
6449 return getVectorCallCost(cast<CallInst>(I), VF);
6450 case Instruction::ExtractValue:
6451 return TTI.getInstructionCost(I, CostKind);
6452 case Instruction::Alloca:
6453 // We cannot easily widen alloca to a scalable alloca, as
6454 // the result would need to be a vector of pointers.
6455 if (VF.isScalable())
6457 return TTI.getArithmeticInstrCost(Instruction::Mul, RetTy, CostKind);
6458 default:
6459 // This opcode is unknown. Assume that it is the same as 'mul'.
6460 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6461 } // end of switch.
6462}
6463
6465 // Ignore ephemeral values.
6467
6468 SmallVector<Value *, 4> DeadInterleavePointerOps;
6470
6471 // If a scalar epilogue is required, users outside the loop won't use
6472 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6473 // that is the case.
6474 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6475 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6476 return RequiresScalarEpilogue &&
6477 !TheLoop->contains(cast<Instruction>(U)->getParent());
6478 };
6479
6481 DFS.perform(LI);
6482 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6483 for (Instruction &I : reverse(*BB)) {
6484 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6485 continue;
6486
6487 // Add instructions that would be trivially dead and are only used by
6488 // values already ignored to DeadOps to seed worklist.
6490 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6491 return VecValuesToIgnore.contains(U) ||
6492 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6493 }))
6494 DeadOps.push_back(&I);
6495
6496 // For interleave groups, we only create a pointer for the start of the
6497 // interleave group. Queue up addresses of group members except the insert
6498 // position for further processing.
6499 if (isAccessInterleaved(&I)) {
6500 auto *Group = getInterleavedAccessGroup(&I);
6501 if (Group->getInsertPos() == &I)
6502 continue;
6503 Value *PointerOp = getLoadStorePointerOperand(&I);
6504 DeadInterleavePointerOps.push_back(PointerOp);
6505 }
6506
6507 // Queue branches for analysis. They are dead, if their successors only
6508 // contain dead instructions.
6509 if (isa<CondBrInst>(&I))
6510 DeadOps.push_back(&I);
6511 }
6512
6513 // Mark ops feeding interleave group members as free, if they are only used
6514 // by other dead computations.
6515 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6516 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6517 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6518 Instruction *UI = cast<Instruction>(U);
6519 return !VecValuesToIgnore.contains(U) &&
6520 (!isAccessInterleaved(UI) ||
6521 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6522 }))
6523 continue;
6524 VecValuesToIgnore.insert(Op);
6525 append_range(DeadInterleavePointerOps, Op->operands());
6526 }
6527
6528 // Mark ops that would be trivially dead and are only used by ignored
6529 // instructions as free.
6530 BasicBlock *Header = TheLoop->getHeader();
6531
6532 // Returns true if the block contains only dead instructions. Such blocks will
6533 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6534 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6535 auto IsEmptyBlock = [this](BasicBlock *BB) {
6536 return all_of(*BB, [this](Instruction &I) {
6537 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6539 });
6540 };
6541 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6542 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6543
6544 // Check if the branch should be considered dead.
6545 if (auto *Br = dyn_cast_or_null<CondBrInst>(Op)) {
6546 BasicBlock *ThenBB = Br->getSuccessor(0);
6547 BasicBlock *ElseBB = Br->getSuccessor(1);
6548 // Don't considers branches leaving the loop for simplification.
6549 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6550 continue;
6551 bool ThenEmpty = IsEmptyBlock(ThenBB);
6552 bool ElseEmpty = IsEmptyBlock(ElseBB);
6553 if ((ThenEmpty && ElseEmpty) ||
6554 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6555 ElseBB->phis().empty()) ||
6556 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6557 ThenBB->phis().empty())) {
6558 VecValuesToIgnore.insert(Br);
6559 DeadOps.push_back(Br->getCondition());
6560 }
6561 continue;
6562 }
6563
6564 // Skip any op that shouldn't be considered dead.
6565 if (!Op || !TheLoop->contains(Op) ||
6566 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6568 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6569 return !VecValuesToIgnore.contains(U) &&
6570 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6571 }))
6572 continue;
6573
6574 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6575 // which applies for both scalar and vector versions. Otherwise it is only
6576 // dead in vector versions, so only add it to VecValuesToIgnore.
6577 if (all_of(Op->users(),
6578 [this](User *U) { return ValuesToIgnore.contains(U); }))
6579 ValuesToIgnore.insert(Op);
6580
6581 VecValuesToIgnore.insert(Op);
6582 append_range(DeadOps, Op->operands());
6583 }
6584
6585 // Ignore type-promoting instructions we identified during reduction
6586 // detection.
6587 for (const auto &Reduction : Legal->getReductionVars()) {
6588 const RecurrenceDescriptor &RedDes = Reduction.second;
6589 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6590 VecValuesToIgnore.insert_range(Casts);
6591 }
6592 // Ignore type-casting instructions we identified during induction
6593 // detection.
6594 for (const auto &Induction : Legal->getInductionVars()) {
6595 const InductionDescriptor &IndDes = Induction.second;
6596 VecValuesToIgnore.insert_range(IndDes.getCastInsts());
6597 }
6598}
6599
6601 // Avoid duplicating work finding in-loop reductions.
6602 if (!InLoopReductions.empty())
6603 return;
6604
6605 for (const auto &Reduction : Legal->getReductionVars()) {
6606 PHINode *Phi = Reduction.first;
6607 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6608
6609 // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
6610 // separately and should not be considered for in-loop reductions.
6611 if (RdxDesc.hasUsesOutsideReductionChain())
6612 continue;
6613
6614 // We don't collect reductions that are type promoted (yet).
6615 if (RdxDesc.getRecurrenceType() != Phi->getType())
6616 continue;
6617
6618 // In-loop AnyOf and FindIV reductions are not yet supported.
6619 RecurKind Kind = RdxDesc.getRecurrenceKind();
6623 continue;
6624
6625 // If the target would prefer this reduction to happen "in-loop", then we
6626 // want to record it as such.
6627 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6628 !TTI.preferInLoopReduction(Kind, Phi->getType()))
6629 continue;
6630
6631 // Check that we can correctly put the reductions into the loop, by
6632 // finding the chain of operations that leads from the phi to the loop
6633 // exit value.
6634 SmallVector<Instruction *, 4> ReductionOperations =
6635 RdxDesc.getReductionOpChain(Phi, TheLoop);
6636 bool InLoop = !ReductionOperations.empty();
6637
6638 if (InLoop) {
6639 InLoopReductions.insert(Phi);
6640 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6641 Instruction *LastChain = Phi;
6642 for (auto *I : ReductionOperations) {
6643 InLoopReductionImmediateChains[I] = LastChain;
6644 LastChain = I;
6645 }
6646 }
6647 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6648 << " reduction for phi: " << *Phi << "\n");
6649 }
6650}
6651
6652// This function will select a scalable VF if the target supports scalable
6653// vectors and a fixed one otherwise.
6654// TODO: we could return a pair of values that specify the max VF and
6655// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6656// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6657// doesn't have a cost model that can choose which plan to execute if
6658// more than one is generated.
6661 unsigned WidestType;
6662 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6663
6665 TTI.enableScalableVectorization()
6668
6669 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
6670 unsigned N = RegSize.getKnownMinValue() / WidestType;
6671 return ElementCount::get(N, RegSize.isScalable());
6672}
6673
6676 ElementCount VF = UserVF;
6677 // Outer loop handling: They may require CFG and instruction level
6678 // transformations before even evaluating whether vectorization is profitable.
6679 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6680 // the vectorization pipeline.
6681 if (!OrigLoop->isInnermost()) {
6682 // If the user doesn't provide a vectorization factor, determine a
6683 // reasonable one.
6684 if (UserVF.isZero()) {
6685 VF = determineVPlanVF(TTI, CM);
6686 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6687
6688 // Make sure we have a VF > 1 for stress testing.
6689 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6690 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6691 << "overriding computed VF.\n");
6692 VF = ElementCount::getFixed(4);
6693 }
6694 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6696 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6697 << "not supported by the target.\n");
6699 "Scalable vectorization requested but not supported by the target",
6700 "the scalable user-specified vectorization width for outer-loop "
6701 "vectorization cannot be used because the target does not support "
6702 "scalable vectors.",
6703 "ScalableVFUnfeasible", ORE, OrigLoop);
6705 }
6706 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6708 "VF needs to be a power of two");
6709 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6710 << "VF " << VF << " to build VPlans.\n");
6711 buildVPlans(VF, VF);
6712
6713 if (VPlans.empty())
6715
6716 // For VPlan build stress testing, we bail out after VPlan construction.
6719
6720 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6721 }
6722
6723 LLVM_DEBUG(
6724 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6725 "VPlan-native path.\n");
6727}
6728
/// Drive VPlan construction for an innermost loop: collect values the cost
/// model should ignore, compute the maximum safe fixed/scalable VFs, honor a
/// legal user-requested VF if its cost is valid, and otherwise build VPlans
/// with recipes for the full range of fixed and scalable VF candidates.
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
  // NOTE(review): the condition continuation is elided in this rendering
  // (presumably the masked-interleaved-access support check) — confirm
  // against upstream before relying on this text.
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  // Let legality record anything it needs before tail folding is applied.
  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  // A user VF is bounded by the maximum safe VF of matching scalability.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
      // NOTE(review): the reporting-call head is elided in this rendering;
      // only its message arguments survive below.
          "UserVF ignored because it may be larger than the maximal safe VF",
          "InvalidUserVF", ORE, OrigLoop);
    } else {
      // NOTE(review): the assertion head for this message is elided here.
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(UserVF, UserVF);
        return;
      }
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates.
  SmallVector<ElementCount> VFCandidates;
  // Candidates are all powers of two up to the respective maximum, for both
  // fixed and scalable vectorization.
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  // Build one VPlan range per scalability kind covering all candidates.
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

}
6799
6801 ElementCount VF) const {
6802 InstructionCost Cost = CM.getInstructionCost(UI, VF);
6803 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6805 return Cost;
6806}
6807
6809 ElementCount VF) const {
6810 return CM.isUniformAfterVectorization(I, VF);
6811}
6812
6813bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6814 return CM.ValuesToIgnore.contains(UI) ||
6815 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6816 SkipCostComputation.contains(UI);
6817}
6818
6820 return CM.getPredBlockCostDivisor(CostKind, BB);
6821}
6822
6824LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6825 VPCostContext &CostCtx) const {
6827 // Cost modeling for inductions is inaccurate in the legacy cost model
6828 // compared to the recipes that are generated. To match here initially during
6829 // VPlan cost model bring up directly use the induction costs from the legacy
6830 // cost model. Note that we do this as pre-processing; the VPlan may not have
6831 // any recipes associated with the original induction increment instruction
6832 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6833 // the cost of induction phis and increments (both that are represented by
6834 // recipes and those that are not), to avoid distinguishing between them here,
6835 // and skip all recipes that represent induction phis and increments (the
6836 // former case) later on, if they exist, to avoid counting them twice.
6837 // Similarly we pre-compute the cost of any optimized truncates.
6838 // TODO: Switch to more accurate costing based on VPlan.
6839 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6841 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
6842 SmallVector<Instruction *> IVInsts = {IVInc};
6843 for (unsigned I = 0; I != IVInsts.size(); I++) {
6844 for (Value *Op : IVInsts[I]->operands()) {
6845 auto *OpI = dyn_cast<Instruction>(Op);
6846 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
6847 continue;
6848 IVInsts.push_back(OpI);
6849 }
6850 }
6851 IVInsts.push_back(IV);
6852 for (User *U : IV->users()) {
6853 auto *CI = cast<Instruction>(U);
6854 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
6855 continue;
6856 IVInsts.push_back(CI);
6857 }
6858
6859 // If the vector loop gets executed exactly once with the given VF, ignore
6860 // the costs of comparison and induction instructions, as they'll get
6861 // simplified away.
6862 // TODO: Remove this code after stepping away from the legacy cost model and
6863 // adding code to simplify VPlans before calculating their costs.
6864 auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
6865 if (TC == VF && !CM.foldTailByMasking())
6866 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
6867 CostCtx.SkipCostComputation);
6868
6869 for (Instruction *IVInst : IVInsts) {
6870 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
6871 continue;
6872 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
6873 LLVM_DEBUG({
6874 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6875 << ": induction instruction " << *IVInst << "\n";
6876 });
6877 Cost += InductionCost;
6878 CostCtx.SkipCostComputation.insert(IVInst);
6879 }
6880 }
6881
6882 /// Compute the cost of all exiting conditions of the loop using the legacy
6883 /// cost model. This is to match the legacy behavior, which adds the cost of
6884 /// all exit conditions. Note that this over-estimates the cost, as there will
6885 /// be a single condition to control the vector loop.
6887 CM.TheLoop->getExitingBlocks(Exiting);
6888 SetVector<Instruction *> ExitInstrs;
6889 // Collect all exit conditions.
6890 for (BasicBlock *EB : Exiting) {
6891 auto *Term = dyn_cast<CondBrInst>(EB->getTerminator());
6892 if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
6893 continue;
6894 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
6895 ExitInstrs.insert(CondI);
6896 }
6897 }
6898 // Compute the cost of all instructions only feeding the exit conditions.
6899 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6900 Instruction *CondI = ExitInstrs[I];
6901 if (!OrigLoop->contains(CondI) ||
6902 !CostCtx.SkipCostComputation.insert(CondI).second)
6903 continue;
6904 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
6905 LLVM_DEBUG({
6906 dbgs() << "Cost of " << CondICost << " for VF " << VF
6907 << ": exit condition instruction " << *CondI << "\n";
6908 });
6909 Cost += CondICost;
6910 for (Value *Op : CondI->operands()) {
6911 auto *OpI = dyn_cast<Instruction>(Op);
6912 if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
6913 any_of(OpI->users(), [&ExitInstrs](User *U) {
6914 return !ExitInstrs.contains(cast<Instruction>(U));
6915 }))
6916 continue;
6917 ExitInstrs.insert(OpI);
6918 }
6919 }
6920
6921 // Pre-compute the costs for branches except for the backedge, as the number
6922 // of replicate regions in a VPlan may not directly match the number of
6923 // branches, which would lead to different decisions.
6924 // TODO: Compute cost of branches for each replicate region in the VPlan,
6925 // which is more accurate than the legacy cost model.
6926 for (BasicBlock *BB : OrigLoop->blocks()) {
6927 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
6928 continue;
6929 CostCtx.SkipCostComputation.insert(BB->getTerminator());
6930 if (BB == OrigLoop->getLoopLatch())
6931 continue;
6932 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
6933 Cost += BranchCost;
6934 }
6935
6936 // Don't apply special costs when instruction cost is forced to make sure the
6937 // forced cost is used for each recipe.
6938 if (ForceTargetInstructionCost.getNumOccurrences())
6939 return Cost;
6940
6941 // Pre-compute costs for instructions that are forced-scalar or profitable to
6942 // scalarize. Their costs will be computed separately in the legacy cost
6943 // model.
6944 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
6945 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
6946 continue;
6947 CostCtx.SkipCostComputation.insert(ForcedScalar);
6948 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
6949 LLVM_DEBUG({
6950 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
6951 << ": forced scalar " << *ForcedScalar << "\n";
6952 });
6953 Cost += ForcedCost;
6954 }
6955 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
6956 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
6957 continue;
6958 CostCtx.SkipCostComputation.insert(Scalarized);
6959 LLVM_DEBUG({
6960 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
6961 << ": profitable to scalarize " << *Scalarized << "\n";
6962 });
6963 Cost += ScalarCost;
6964 }
6965
6966 return Cost;
6967}
6968
6969InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
6970 VPRegisterUsage *RU) const {
6971 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
6972 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
6973
6974 // Now compute and add the VPlan-based cost.
6975 Cost += Plan.cost(VF, CostCtx);
6976
6977 // Add the cost of spills due to excess register usage
6978 if (CM.shouldConsiderRegPressureForVF(VF))
6979 Cost += RU->spillCost(CostCtx, ForceTargetNumVectorRegs);
6980
6981#ifndef NDEBUG
6982 unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
6983 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
6984 << " (Estimated cost per lane: ");
6985 if (Cost.isValid()) {
6986 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
6987 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
6988 } else /* No point dividing an invalid cost - it will still be invalid */
6989 LLVM_DEBUG(dbgs() << "Invalid");
6990 LLVM_DEBUG(dbgs() << ")\n");
6991#endif
6992 return Cost;
6993}
6994
6995#ifndef NDEBUG
6996/// Return true if the original loop \p TheLoop contains any instructions that do
6997/// not have corresponding recipes in \p Plan and are not marked to be ignored
6998/// in \p CostCtx. This means the VPlan contains simplifications that the legacy
6999/// cost-model did not account for.
7001 VPCostContext &CostCtx,
7002 Loop *TheLoop,
7003 ElementCount VF) {
7004 using namespace VPlanPatternMatch;
7005 // First collect all instructions for the recipes in Plan.
7006 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7007 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7008 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7009 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7010 return &WidenMem->getIngredient();
7011 return nullptr;
7012 };
7013
7014 // Check if a select for a safe divisor was hoisted to the pre-header. If so,
7015 // the select doesn't need to be considered for the vector loop cost; go with
7016 // the more accurate VPlan-based cost model.
7017 for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
7018 auto *VPI = dyn_cast<VPInstruction>(&R);
7019 if (!VPI || VPI->getOpcode() != Instruction::Select)
7020 continue;
7021
7022 if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
7023 switch (WR->getOpcode()) {
7024 case Instruction::UDiv:
7025 case Instruction::SDiv:
7026 case Instruction::URem:
7027 case Instruction::SRem:
7028 return true;
7029 default:
7030 break;
7031 }
7032 }
7033 }
// Instructions that have a corresponding recipe in the plan. Any in-loop
// instruction missing from this set (and not skipped) indicates a
// simplification performed only by VPlan.
7035 DenseSet<Instruction *> SeenInstrs;
7036 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7038 for (VPRecipeBase &R : *VPBB) {
// Interleave groups: record every member instruction as covered.
7039 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7040 auto *IG = IR->getInterleaveGroup();
7041 unsigned NumMembers = IG->getNumMembers();
7042 for (unsigned I = 0; I != NumMembers; ++I) {
7043 if (Instruction *M = IG->getMember(I))
7044 SeenInstrs.insert(M);
7045 }
7046 continue;
7047 }
7048 // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
7049 // cost model won't cost it whilst the legacy will.
7050 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
7051 if (none_of(FOR->users(),
7052 match_fn(m_VPInstruction<
7054 return true;
7055 }
7056 // The VPlan-based cost model is more accurate for partial reductions and
7057 // comparing against the legacy cost isn't desirable.
7058 if (auto *VPR = dyn_cast<VPReductionRecipe>(&R))
7059 if (VPR->isPartialReduction())
7060 return true;
7061
7062 // The VPlan-based cost model can analyze if recipes are scalar
7063 // recursively, but the legacy cost model cannot.
7064 if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
7065 auto *AddrI = dyn_cast<Instruction>(
7066 getLoadStorePointerOperand(&WidenMemR->getIngredient()));
7067 if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
7068 CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
7069 return true;
7070
7071 if (WidenMemR->isReverse()) {
7072 // If the stored value of a reverse store is invariant, LICM will
7073 // hoist the reverse operation to the preheader. In this case, the
7074 // result of the VPlan-based cost model will diverge from that of
7075 // the legacy model.
7076 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
7077 if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
7078 return true;
7079
7080 if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
7081 if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
7082 return true;
7083 }
7084 }
7085
7086 // The legacy cost model costs non-header phis with a scalar VF as a phi,
7087 // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
7088 if (isa<VPBlendRecipe>(&R) &&
7089 vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
7090 return true;
7091
7092 // The legacy cost model won't calculate the cost of the LogicalAnd which
7093 // will be replaced with vp_merge.
7095 return true;
7096
7097 /// If a VPlan transform folded a recipe to one producing a single-scalar,
7098 /// but the original instruction wasn't uniform-after-vectorization in the
7099 /// legacy cost model, the legacy cost overestimates the actual cost.
7100 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
7101 if (RepR->isSingleScalar() &&
7103 RepR->getUnderlyingInstr(), VF))
7104 return true;
7105 }
7106 if (Instruction *UI = GetInstructionForCost(&R)) {
7107 // If we adjusted the predicate of the recipe, the cost in the legacy
7108 // cost model may be different.
7109 CmpPredicate Pred;
7110 if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
7111 cast<VPRecipeWithIRFlags>(R).getPredicate() !=
7112 cast<CmpInst>(UI)->getPredicate())
7113 return true;
7114
7115 // Recipes with underlying instructions being moved out of the loop
7116 // region by LICM may cause discrepancies between the legacy cost model
7117 // and the VPlan-based cost model.
7118 if (!VPBB->getEnclosingLoopRegion())
7119 return true;
7120
7121 SeenInstrs.insert(UI);
7122 }
7123 }
7124 }
7125
7126 // If a reverse recipe has been sunk to the middle block (e.g., for a load
7127 // whose result is only used as a live-out), VPlan avoids the per-iteration
7128 // reverse shuffle cost that the legacy model accounts for.
7129 if (any_of(*Plan.getMiddleBlock(), [](const VPRecipeBase &R) {
7130 return match(&R, m_VPInstruction<VPInstruction::Reverse>());
7131 }))
7132 return true;
7133
7134 // Return true if the loop contains any instructions that are not also part of
7135 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7136 // that the VPlan contains extra simplifications.
7137 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7138 TheLoop](BasicBlock *BB) {
7139 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7140 // Skip induction phis when checking for simplifications, as they may not
7141 // be lowered directly to a corresponding PHI recipe.
7142 if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
7143 CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
7144 return false;
7145 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7146 });
7147 });
7148}
7149#endif
7150
// NOTE(review): the signature line is not visible in this dump; based on the
// body this selects the most profitable vectorization factor across all
// candidate VPlans, with the scalar VF as the baseline — confirm upstream.
7152 if (VPlans.empty())
7154 // If there is a single VPlan with a single VF, return it directly.
7155 VPlan &FirstPlan = *VPlans[0];
7156 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7157 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7158
7159 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7160 << (CM.CostKind == TTI::TCK_RecipThroughput
7161 ? "Reciprocal Throughput\n"
7162 : CM.CostKind == TTI::TCK_Latency
7163 ? "Instruction Latency\n"
7164 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7165 : CM.CostKind == TTI::TCK_SizeAndLatency
7166 ? "Code Size and Latency\n"
7167 : "Unknown\n"));
7168
7170 assert(hasPlanWithVF(ScalarVF) &&
7171 "More than a single plan/VF w/o any plan having scalar VF");
7172
7173 // TODO: Compute scalar cost using VPlan-based cost model.
7174 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7175 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7176 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7177 VectorizationFactor BestFactor = ScalarFactor;
7178
7179 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7180 if (ForceVectorization) {
7181 // Ignore scalar width, because the user explicitly wants vectorization.
7182 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7183 // evaluation.
7184 BestFactor.Cost = InstructionCost::getMax();
7185 }
7186
7187 for (auto &P : VPlans) {
7188 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
7189 P->vectorFactors().end());
7190
// Register usage only needs to be computed when at least one VF of this
// plan considers register pressure.
7192 bool ConsiderRegPressure = any_of(VFs, [this](ElementCount VF) {
7193 return CM.shouldConsiderRegPressureForVF(VF);
7194 });
7196 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
7197
7198 for (unsigned I = 0; I < VFs.size(); I++) {
7199 ElementCount VF = VFs[I];
7200 if (VF.isScalar())
7201 continue;
7202 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7203 LLVM_DEBUG(
7204 dbgs()
7205 << "LV: Not considering vector loop of width " << VF
7206 << " because it will not generate any vector instructions.\n");
7207 continue;
7208 }
7209 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7210 LLVM_DEBUG(
7211 dbgs()
7212 << "LV: Not considering vector loop of width " << VF
7213 << " because it would cause replicated blocks to be generated,"
7214 << " which isn't allowed when optimizing for size.\n");
7215 continue;
7216 }
7217
7219 cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr);
7220 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7221
7222 if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7223 BestFactor = CurrentFactor;
7224
7225 // If profitable add it to ProfitableVF list.
7226 if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
7227 ProfitableVFs.push_back(CurrentFactor);
7228 }
7229 }
7230
7231#ifndef NDEBUG
7232 // Select the optimal vectorization factor according to the legacy cost-model.
7233 // This is now only used to verify the decisions by the new VPlan-based
7234 // cost-model and will be retired once the VPlan-based cost-model is
7235 // stabilized.
7236 VectorizationFactor LegacyVF = selectVectorizationFactor();
7237 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7238
7239 // Pre-compute the cost and use it to check if BestPlan contains any
7240 // simplifications not accounted for in the legacy cost model. If that's the
7241 // case, don't trigger the assertion, as the extra simplifications may cause a
7242 // different VF to be picked by the VPlan-based cost model.
7243 VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, CM.PSE,
7244 OrigLoop);
7245 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7246 // Verify that the VPlan-based and legacy cost models agree, except for
7247 // * VPlans with early exits,
7248 // * VPlans with additional VPlan simplifications,
7249 // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
7250 // vp_scatter/vp_gather).
7251 // The legacy cost model doesn't properly model costs for such loops.
7252 bool UsesEVLGatherScatter =
7254 BestPlan.getVectorLoopRegion()->getEntry())),
7255 [](VPBasicBlock *VPBB) {
7256 return any_of(*VPBB, [](VPRecipeBase &R) {
7257 return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
7258 !cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
7259 });
7260 });
7261 assert(
7262 (BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7263 !Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
7265 getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
7267 getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7268 " VPlan cost model and legacy cost model disagreed");
7269 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7270 "when vectorizing, the scalar cost must be computed.");
7271#endif
7272
7273 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7274 return BestFactor;
7275}
7276
// NOTE(review): the signature line is not visible in this dump. Based on the
// body, this executes the chosen VPlan: runs the final lowering transforms
// (unroll by UF, optimize for VF/UF, region dissolution, ...), expands SCEVs,
// creates the loop skeleton, and generates the vectorized IR. Returns the map
// of expanded SCEVs — presumably for reuse by epilogue vectorization; confirm
// with callers.
7278 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7280 EpilogueVectorizationKind EpilogueVecKind) {
7281 assert(BestVPlan.hasVF(BestVF) &&
7282 "Trying to execute plan with unsupported VF");
7283 assert(BestVPlan.hasUF(BestUF) &&
7284 "Trying to execute plan with unsupported UF");
7285 if (BestVPlan.hasEarlyExit())
7286 ++LoopsEarlyExitVectorized;
7287 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7288 // cost model is complete for better cost estimates.
7289 RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
// Only propagate branch weights when the original latch terminator carries
// branch-weight metadata.
7293 bool HasBranchWeights =
7294 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
7295 if (HasBranchWeights) {
7296 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7298 BestVPlan, BestVF, VScale);
7299 }
7300
7301 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7302 VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
7303
7304 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7306 if (EpilogueVecKind == EpilogueVectorizationKind::None)
7308 if (BestVPlan.getEntry()->getSingleSuccessor() ==
7309 BestVPlan.getScalarPreheader()) {
7310 // TODO: The vector loop would be dead, should not even try to vectorize.
7311 ORE->emit([&]() {
7312 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
7313 OrigLoop->getStartLoc(),
7314 OrigLoop->getHeader())
7315 << "Created vector loop never executes due to insufficient trip "
7316 "count.";
7317 });
7319 }
7320
7322
7324 // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
7326 // Regions are dissolved after optimizing for VF and UF, which completely
7327 // removes unneeded loop regions first.
7329 // Expand BranchOnTwoConds after dissolution, when latch has direct access to
7330 // its successors.
7332 // Convert loops with variable-length stepping after regions are dissolved.
7334 // Remove dead back-edges for single-iteration loops with BranchOnCond(true).
7335 // Only process loop latches to avoid removing edges from the middle block,
7336 // which may be needed for epilogue vectorization.
7337 VPlanTransforms::removeBranchOnConst(BestVPlan, /*OnlyLatches=*/true);
7340 BestVPlan, VectorPH, CM.foldTailByMasking(),
7341 CM.requiresScalarEpilogue(BestVF.isVector()), &BestVPlan.getVFxUF());
7342 VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
7343 VPlanTransforms::cse(BestVPlan);
7345 VPlanTransforms::simplifyKnownEVL(BestVPlan, BestVF, PSE);
7346
7347 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7348 // making any changes to the CFG.
7349 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7350 VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
7351
7352 // Perform the actual loop transformation.
7353 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7354 OrigLoop->getParentLoop(),
7355 Legal->getWidestInductionType());
7356
7357#ifdef EXPENSIVE_CHECKS
7358 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7359#endif
7360
7361 // 1. Set up the skeleton for vectorization, including vector pre-header and
7362 // middle block. The vector loop is created during VPlan execution.
7363 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7365 State.CFG.PrevBB->getSingleSuccessor(), &BestVPlan);
7367
7368 assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");
7369
7370 // After vectorization, the exit blocks of the original loop will have
7371 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7372 // looked through single-entry phis.
7373 ScalarEvolution &SE = *PSE.getSE();
7374 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7375 if (!Exit->hasPredecessors())
7376 continue;
7377 for (VPRecipeBase &PhiR : Exit->phis())
7379 &cast<VPIRPhi>(PhiR).getIRPhi());
7380 }
7381 // Forget the original loop and block dispositions.
7382 SE.forgetLoop(OrigLoop);
7384
7386
7387 //===------------------------------------------------===//
7388 //
7389 // Notice: any optimization or new instruction that go
7390 // into the code below should also be implemented in
7391 // the cost-model.
7392 //
7393 //===------------------------------------------------===//
7394
7395 // Retrieve loop information before executing the plan, which may remove the
7396 // original loop, if it becomes unreachable.
7397 MDNode *LID = OrigLoop->getLoopID();
7398 unsigned OrigLoopInvocationWeight = 0;
7399 std::optional<unsigned> OrigAverageTripCount =
7400 getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
7401
7402 BestVPlan.execute(&State);
7403
7404 // 2.6. Maintain Loop Hints
7405 // Keep all loop hints from the original loop on the vector loop (we'll
7406 // replace the vectorizer-specific hints below).
7407 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT)
;
7408 // Add metadata to disable runtime unrolling a scalar loop when there
7409 // are no runtime checks about strides and memory. A scalar loop that is
7410 // rarely used is not worth unrolling.
7411 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
7413 HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
7414 : nullptr,
7415 HeaderVPBB, BestVPlan,
7416 EpilogueVecKind == EpilogueVectorizationKind::Epilogue, LID,
7417 OrigAverageTripCount, OrigLoopInvocationWeight,
7418 estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()),
7419 DisableRuntimeUnroll);
7420
7421 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7422 // predication, updating analyses.
7423 ILV.fixVectorizedLoop(State);
7424
7426
7427 return ExpandedSCEVs;
7428}
7429
7430//===--------------------------------------------------------------------===//
7431// EpilogueVectorizerMainLoop
7432//===--------------------------------------------------------------------===//
7433
// Debug-only trace emitted at the start of the first (main-loop) pass of
// epilogue vectorization; prints the selected main/epilogue VF and UF.
7435 LLVM_DEBUG({
7436 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7437 << "Main Loop VF:" << EPI.MainLoopVF
7438 << ", Main Loop UF:" << EPI.MainLoopUF
7439 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7440 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7441 });
7442}
7443
// Debug-only trace after the first pass: dumps the whole enclosing function
// in its intermediate (main-loop-vectorized) state.
7446 dbgs() << "intermediate fn:\n"
7447 << *OrigLoop->getHeader()->getParent() << "\n";
7448 });
7449}
7450
7451//===--------------------------------------------------------------------===//
7452// EpilogueVectorizerEpilogueLoop
7453//===--------------------------------------------------------------------===//
7454
7455/// This function creates a new scalar preheader, using the previous one as
7456/// entry block to the epilogue VPlan. The minimum iteration check is being
7457/// represented in VPlan.
7459 BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
7460 BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
7461 OriginalScalarPH->setName("vec.epilog.iter.check");
// Wrap the old scalar preheader as the new VPlan entry and migrate the
// movable recipes from the previous entry block into it.
7462 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
7463 VPBasicBlock *OldEntry = Plan.getEntry();
7464 for (auto &R : make_early_inc_range(*OldEntry)) {
7465 // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
7466 // definition.
7467 if (isa<VPIRInstruction>(&R))
7468 continue;
7469 R.moveBefore(*NewEntry, NewEntry->end());
7470 }
7471
7472 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
7473 Plan.setEntry(NewEntry);
7474 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7475
7476 return OriginalScalarPH;
7477}
7478
// Debug-only trace emitted at the start of the second (epilogue-loop) pass;
// prints the epilogue VF and UF.
7480 LLVM_DEBUG({
7481 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7482 << "Epilogue Loop VF:" << EPI.EpilogueVF
7483 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7484 });
7485}
7486
// Debug-only trace after the second pass: dumps the fully vectorized function.
7489 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7490 });
7491}
7492
/// Try to build a widened memory recipe (VPWidenLoadRecipe /
/// VPWidenStoreRecipe) for the load or store wrapped by \p VPI across
/// \p Range; returns nullptr if the access should not be widened.
7493VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
7494 VFRange &Range) {
7495 assert((VPI->getOpcode() == Instruction::Load ||
7496 VPI->getOpcode() == Instruction::Store) &&
7497 "Must be called with either a load or store");
7499
// Widen only for VFs where the cost model decided against scalarization.
7500 auto WillWiden = [&](ElementCount VF) -> bool {
7502 CM.getWideningDecision(I, VF);
7504 "CM decision should be taken at this point.");
7506 return true;
7507 if (CM.isScalarAfterVectorization(I, VF) ||
7508 CM.isProfitableToScalarize(I, VF))
7509 return false;
7511 };
7512
7514 return nullptr;
7515
7516 // If a mask is not required, drop it - use unmasked version for safe loads.
7517 // TODO: Determine if mask is needed in VPlan.
7518 VPValue *Mask = CM.isMaskRequired(I) ? VPI->getMask() : nullptr;
7519
7520 // Determine if the pointer operand of the access is either consecutive or
7521 // reverse consecutive.
7523 CM.getWideningDecision(I, Range.Start);
7525 bool Consecutive =
7527
// For loads the pointer is operand 0; for stores, operand 1.
7528 VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0)
7529 : VPI->getOperand(1);
7530 if (Consecutive) {
7533 VPSingleDefRecipe *VectorPtr;
7534 if (Reverse) {
7535 // When folding the tail, we may compute an address that we don't in the
7536 // original scalar loop: drop the GEP no-wrap flags in this case.
7537 // Otherwise preserve existing flags without no-unsigned-wrap, as we will
7538 // emit negative indices.
7539 GEPNoWrapFlags Flags =
7540 CM.foldTailByMasking() || !GEP
7542 : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
7543 VectorPtr = new VPVectorEndPointerRecipe(
7544 Ptr, &Plan.getVF(), getLoadStoreType(I),
7545 /*Stride*/ -1, Flags, VPI->getDebugLoc());
7546 } else {
7547 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7548 GEP ? GEP->getNoWrapFlags()
7550 VPI->getDebugLoc());
7551 }
7552 Builder.insert(VectorPtr);
7553 Ptr = VectorPtr;
7554 }
7555
7556 if (VPI->getOpcode() == Instruction::Load) {
7557 auto *Load = cast<LoadInst>(I);
7558 auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7559 *VPI, Load->getDebugLoc());
// Reverse loads additionally emit a Reverse VPInstruction on the result.
7560 if (Reverse) {
7561 Builder.insert(LoadR);
7562 return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
7563 LoadR->getDebugLoc());
7564 }
7565 return LoadR;
7566 }
7567
// Reverse stores reverse the stored value before the widened store.
7568 StoreInst *Store = cast<StoreInst>(I);
7569 VPValue *StoredVal = VPI->getOperand(0);
7570 if (Reverse)
7571 StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
7572 Store->getDebugLoc());
7573 return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
7574 Reverse, *VPI, Store->getDebugLoc());
7575}
7576
/// Try to replace a trunc of a widened integer induction with a truncated
/// induction recipe; returns nullptr when the optimization does not apply in
/// \p Range.
7578VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
7579 VFRange &Range) {
7580 auto *I = cast<TruncInst>(VPI->getUnderlyingInstr());
7581 // Optimize the special case where the source is a constant integer
7582 // induction variable. Notice that we can only optimize the 'trunc' case
7583 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7584 // (c) other casts depend on pointer size.
7585
7586 // Determine whether \p K is a truncation based on an induction variable that
7587 // can be optimized.
7590 I),
7591 Range))
7592 return nullptr;
7593
// Pull the induction descriptor components off the widened IV recipe that
// defines the trunc's operand.
7595 VPI->getOperand(0)->getDefiningRecipe());
7596 PHINode *Phi = WidenIV->getPHINode();
7597 VPIRValue *Start = WidenIV->getStartValue();
7598 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
7599
7600 // Wrap flags from the original induction do not apply to the truncated type,
7601 // so do not propagate them.
7602 VPIRFlags Flags = VPIRFlags::WrapFlagsTy(false, false);
7603 VPValue *Step =
7605 return new VPWidenIntOrFpInductionRecipe(
7606 Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
7607}
7608
/// Try to widen the call wrapped by \p VPI, either as a vector intrinsic
/// (VPWidenIntrinsicRecipe) or as a call to a vectorized function variant
/// (VPWidenCallRecipe); returns nullptr if the call should be scalarized or
/// dropped.
7609VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
7610 VFRange &Range) {
7611 CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr());
7613 [this, CI](ElementCount VF) {
7614 return CM.isScalarWithPredication(CI, VF);
7615 },
7616 Range);
7617
7618 if (IsPredicated)
7619 return nullptr;
7620
// Side-effect-free bookkeeping intrinsics are dropped rather than widened.
7622 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7623 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7624 ID == Intrinsic::pseudoprobe ||
7625 ID == Intrinsic::experimental_noalias_scope_decl))
7626 return nullptr;
7627
7629 VPI->op_begin() + CI->arg_size());
7630
7631 // Is it beneficial to perform intrinsic call compared to lib call?
7632 bool ShouldUseVectorIntrinsic =
7634 [&](ElementCount VF) -> bool {
7635 return CM.getCallWideningDecision(CI, VF).Kind ==
7637 },
7638 Range);
7639 if (ShouldUseVectorIntrinsic)
7640 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
7641 VPI->getDebugLoc());
7642
7643 Function *Variant = nullptr;
7644 std::optional<unsigned> MaskPos;
7645 // Is it better to call a vectorized version of the function than to
7646 // scalarize the call?
7647 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7648 [&](ElementCount VF) -> bool {
7649 // The following case may be scalarized depending on the VF.
7650 // The flag shows whether we can use a usual Call for vectorized
7651 // version of the instruction.
7652
7653 // If we've found a variant at a previous VF, then stop looking. A
7654 // vectorized variant of a function expects input in a certain shape
7655 // -- basically the number of input registers, the number of lanes
7656 // per register, and whether there's a mask required.
7657 // We store a pointer to the variant in the VPWidenCallRecipe, so
7658 // once we have an appropriate variant it's only valid for that VF.
7659 // This will force a different vplan to be generated for each VF that
7660 // finds a valid variant.
7661 if (Variant)
7662 return false;
7663 LoopVectorizationCostModel::CallWideningDecision Decision =
7664 CM.getCallWideningDecision(CI, VF);
7666 Variant = Decision.Variant;
7667 MaskPos = Decision.MaskPos;
7668 return true;
7669 }
7670
7671 return false;
7672 },
7673 Range);
7674 if (ShouldUseVectorCall) {
7675 if (MaskPos.has_value()) {
7676 // We have 2 cases that would require a mask:
7677 // 1) The call needs to be predicated, either due to a conditional
7678 // in the scalar loop or use of an active lane mask with
7679 // tail-folding, and we use the appropriate mask for the block.
7680 // 2) No mask is required for the call instruction, but the only
7681 // available vector variant at this VF requires a mask, so we
7682 // synthesize an all-true mask.
7683 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7684
7685 Ops.insert(Ops.begin() + *MaskPos, Mask);
7686 }
7687
7688 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7689 return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
7690 VPI->getDebugLoc());
7691 }
7692
7693 return nullptr;
7694}
7695
/// Return true if \p I should be widened for the VFs in \p Range, i.e. it is
/// neither scalar-after-vectorization, profitable to scalarize, nor
/// predicated for those VFs.
7696bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7698 "Instruction should have been handled earlier");
7699 // Instruction should be widened, unless it is scalar after vectorization,
7700 // scalarization is profitable or it is predicated.
7701 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7702 return CM.isScalarAfterVectorization(I, VF) ||
7703 CM.isProfitableToScalarize(I, VF) ||
7704 CM.isScalarWithPredication(I, VF);
7705 };
7707 Range);
7708}
7709
/// Build a VPWidenRecipe for the supported opcodes of \p VPI's underlying
/// instruction; returns nullptr for unsupported opcodes. Predicated div/rem
/// get a select-based safe divisor; extractvalue gets its index materialized
/// as a constant operand.
7710VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
7711 auto *I = VPI->getUnderlyingInstr();
7712 switch (VPI->getOpcode()) {
7713 default:
7714 return nullptr;
7715 case Instruction::SDiv:
7716 case Instruction::UDiv:
7717 case Instruction::SRem:
7718 case Instruction::URem: {
7719 // If not provably safe, use a select to form a safe divisor before widening the
7720 // div/rem operation itself. Otherwise fall through to general handling below.
7721 if (CM.isPredicatedInst(I)) {
7723 VPValue *Mask = VPI->getMask();
7724 VPValue *One = Plan.getConstantInt(I->getType(), 1u);
7725 auto *SafeRHS =
7726 Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc());
7727 Ops[1] = SafeRHS;
7728 return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
7729 }
7730 [[fallthrough]];
7731 }
7732 case Instruction::Add:
7733 case Instruction::And:
7734 case Instruction::AShr:
7735 case Instruction::FAdd:
7736 case Instruction::FCmp:
7737 case Instruction::FDiv:
7738 case Instruction::FMul:
7739 case Instruction::FNeg:
7740 case Instruction::FRem:
7741 case Instruction::FSub:
7742 case Instruction::ICmp:
7743 case Instruction::LShr:
7744 case Instruction::Mul:
7745 case Instruction::Or:
7746 case Instruction::Select:
7747 case Instruction::Shl:
7748 case Instruction::Sub:
7749 case Instruction::Xor:
7750 case Instruction::Freeze:
7751 return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
7752 VPI->getDebugLoc());
7753 case Instruction::ExtractValue: {
7755 auto *EVI = cast<ExtractValueInst>(I);
7756 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7757 unsigned Idx = EVI->getIndices()[0];
7758 NewOps.push_back(Plan.getConstantInt(32, Idx));
7759 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
7760 }
7761 };
7762}
7763
// Build a VPHistogramRecipe for the histogram pattern described by \p HI,
// using \p VPI (the store) for operands and debug location. Operands are:
// bucket address, increment value, and — when the store needs masking — the
// mask. Only Add/Sub updates are supported (asserted below).
7764VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7765                                                        VPInstruction *VPI) {
7766  // FIXME: Support other operations.
7767  unsigned Opcode = HI->Update->getOpcode();
7768  assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7769         "Histogram update operation must be an Add or Sub");
7770
// NOTE(review): the declaration of HGramOps (presumably a SmallVector of
// VPValue*) is elided in this rendering (line 7771).
7772  // Bucket address.
7773  HGramOps.push_back(VPI->getOperand(1));
7774  // Increment value.
7775  HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
7776
7777  // In case of predicated execution (due to tail-folding, or conditional
7778  // execution, or both), pass the relevant mask.
7779  if (CM.isMaskRequired(HI->Store))
7780    HGramOps.push_back(VPI->getMask());
7781
7782  return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7783}
7784
// NOTE(review): this is the body of the replication handler (its signature's
// first line, presumably `VPRecipeBuilder::handleReplication(VPInstruction
// *VPI,`, is elided in this rendering at line 7785). It builds a
// VPReplicateRecipe for an instruction that will be executed per-lane,
// optionally uniform (first lane only) and optionally masked.
7786                                                   VFRange &Range) {
7787  auto *I = VPI->getUnderlyingInstr();
// NOTE(review): the head of the IsUniform computation (presumably
// `bool IsUniform = getDecisionAndClampRange(`) is elided here (line 7788).
7789      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7790      Range);
7791
7792  bool IsPredicated = CM.isPredicatedInst(I);
7793
7794  // Even if the instruction is not marked as uniform, there are certain
7795  // intrinsic calls that can be effectively treated as such, so we check for
7796  // them here. Conservatively, we only do this for scalable vectors, since
7797  // for fixed-width VFs we can always fall back on full scalarization.
7798  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
7799    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
7800    case Intrinsic::assume:
7801    case Intrinsic::lifetime_start:
7802    case Intrinsic::lifetime_end:
7803      // For scalable vectors if one of the operands is variant then we still
7804      // want to mark as uniform, which will generate one instruction for just
7805      // the first lane of the vector. We can't scalarize the call in the same
7806      // way as for fixed-width vectors because we don't know how many lanes
7807      // there are.
7808      //
7809      // The reasons for doing it this way for scalable vectors are:
7810      //  1. For the assume intrinsic generating the instruction for the first
7811      //     lane is still be better than not generating any at all. For
7812      //     example, the input may be a splat across all lanes.
7813      //  2. For the lifetime start/end intrinsics the pointer operand only
7814      //     does anything useful when the input comes from a stack object,
7815      //     which suggests it should always be uniform. For non-stack objects
7816      //     the effect is to poison the object, which still allows us to
7817      //     remove the call.
7818      IsUniform = true;
7819      break;
7820    default:
7821      break;
7822    }
7823  }
7824  VPValue *BlockInMask = nullptr;
7825  if (!IsPredicated) {
7826    // Finalize the recipe for Instr, first if it is not predicated.
7827    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7828  } else {
7829    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7830    // Instructions marked for predication are replicated and a mask operand is
7831    // added initially. Masked replicate recipes will later be placed under an
7832    // if-then construct to prevent side-effects. Generate recipes to compute
7833    // the block mask for this region.
7834    BlockInMask = VPI->getMask();
7835  }
7836
7837  // Note that there is some custom logic to mark some intrinsics as uniform
7838  // manually above for scalable vectors, which this assert needs to account for
7839  // as well.
7840  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
7841          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
7842         "Should not predicate a uniform recipe");
7843  auto *Recipe =
7844      new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
7845                            BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
7846  return Recipe;
7847}
7848
// NOTE(review): body of tryToCreateWidenNonPhiRecipe (its signature lines,
// presumably taking `VPSingleDefRecipe *R`, are elided at 7849-7850; the name
// is grounded by the call site in tryToBuildVPlanWithVPRecipes). Dispatches a
// non-phi VPInstruction to the most specific widening recipe builder, or
// returns nullptr if the instruction should be replicated instead.
7851                                                          VFRange &Range) {
7852  assert(!R->isPhi() && "phis must be handled earlier");
7853  // First, check for specific widening recipes that deal with optimizing
7854  // truncates, calls and memory operations.
7855
7856  VPRecipeBase *Recipe;
7857  auto *VPI = cast<VPInstruction>(R);
// Truncates of an induction may fold into a narrower induction recipe.
7858  if (VPI->getOpcode() == Instruction::Trunc &&
7859      (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
7860    return Recipe;
7861
7862  // All widen recipes below deal only with VF > 1.
// NOTE(review): the call head for the VF==1 range-clamp check (presumably
// `if (LoopVectorizationPlanner::getDecisionAndClampRange(`) is elided here.
7864          [&](ElementCount VF) { return VF.isScalar(); }, Range))
7865    return nullptr;
7866
7867  if (VPI->getOpcode() == Instruction::Call)
7868    return tryToWidenCall(VPI, Range);
7869
7870  Instruction *Instr = R->getUnderlyingInstr();
// Stores that form a recognized histogram pattern get a dedicated recipe,
// checked before generic memory widening.
7871  if (VPI->getOpcode() == Instruction::Store)
7872    if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr)))
7873      return tryToWidenHistogram(*HistInfo, VPI);
7874
7875  if (VPI->getOpcode() == Instruction::Load ||
7876      VPI->getOpcode() == Instruction::Store)
7877    return tryToWidenMemory(VPI, Range);
7878
// Cost-model gate: fall back to replication if widening is not worthwhile.
7879  if (!shouldWiden(Instr, Range))
7880    return nullptr;
7881
7882  if (VPI->getOpcode() == Instruction::GetElementPtr)
7883    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr),
7884                                VPI->operandsWithoutMask(), *VPI,
7885                                VPI->getDebugLoc());
7886
7887  if (Instruction::isCast(VPI->getOpcode())) {
7888    auto *CI = cast<CastInst>(Instr);
7889    auto *CastR = cast<VPInstructionWithType>(VPI);
7890    return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
7891                                 CastR->getResultType(), CI, *VPI, *VPI,
7892                                 VPI->getDebugLoc());
7893  }
7894
// Everything else: generic arithmetic/logic widening (or nullptr).
7895  return tryToWiden(VPI);
7896}
7897
// Build one VPlan per VF sub-range in [MinVF, MaxVF]: construct a common base
// VPlan0, then repeatedly duplicate it, specialize it for a clamped VF range
// via tryToBuildVPlanWithVPRecipes, run post-construction optimizations, and
// collect the surviving plans in VPlans. Inner loops only.
7898void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
7899                                                        ElementCount MaxVF) {
7900  if (ElementCount::isKnownGT(MinVF, MaxVF))
7901    return;
7902
7903  assert(OrigLoop->isInnermost() && "Inner loop expected.");
7904
7905  const LoopAccessInfo *LAI = Legal->getLAI();
// NOTE(review): the LoopVersioning `LVer` constructor head is elided in this
// rendering (line 7906); only its trailing arguments are visible.
7907                      OrigLoop, LI, DT, PSE.getSE());
7908  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
// NOTE(review): the second conjunct of this condition is elided (line 7909).
7910    // Only use noalias metadata when using memory checks guaranteeing no
7911    // overlap across all iterations.
7912    LVer.prepareNoAliasMetadata();
7913  }
7914
7915  // Create initial base VPlan0, to serve as common starting point for all
7916  // candidates built later for specific VF ranges.
7917  auto VPlan0 = VPlanTransforms::buildVPlan0(
7918      OrigLoop, *LI, Legal->getWidestInductionType(),
7919      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer);
7920
7921  // Create recipes for header phis.
// NOTE(review): the transform call head (line 7922) is elided here.
7923      *VPlan0, PSE, *OrigLoop, Legal->getInductionVars(),
7924      Legal->getReductionVars(), Legal->getFixedOrderRecurrences(),
7925      CM.getInLoopReductions(), Hints.allowReordering());
7926
7928  // If we're vectorizing a loop with an uncountable exit, make sure that the
7929  // recipes are safe to handle.
7930  // TODO: Remove this once we can properly check the VPlan itself for both
7931  //       the presence of an uncountable exit and the presence of stores in
7932  //       the loop inside handleEarlyExits itself.
// NOTE(review): the EEStyle declaration/default (line 7933) and the two
// branch results of the conditional operator (lines 7936-7937) are elided.
7934  if (Legal->hasUncountableEarlyExit())
7935    EEStyle = Legal->hasUncountableExitWithSideEffects()
7938
// Bail out entirely if the early-exit structure cannot be handled.
7939  if (!VPlanTransforms::handleEarlyExits(*VPlan0, EEStyle, OrigLoop, PSE, *DT,
7940                                         Legal->getAssumptionCache()))
7941    return;
7944  if (CM.foldTailByMasking())
// NOTE(review): the tail-folding transform call head (lines 7945-7946) is
// elided; only its trailing argument is visible.
7947        *VPlan0);
7948
// Iterate over VF sub-ranges: each tryToBuildVPlanWithVPRecipes call clamps
// SubRange.End, and the loop resumes from there.
7949  auto MaxVFTimes2 = MaxVF * 2;
7950  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7951    VFRange SubRange = {VF, MaxVFTimes2};
7952    if (auto Plan = tryToBuildVPlanWithVPRecipes(
7953            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
7954      // Now optimize the initial VPlan.
7955      VPlanTransforms::hoistPredicatedLoads(*Plan, PSE, OrigLoop);
7956      VPlanTransforms::sinkPredicatedStores(*Plan, PSE, OrigLoop);
// NOTE(review): the minimal-bitwidth truncation transform call head (line
// 7957) is elided here.
7958                                          CM.getMinimalBitwidths());
7960      // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
7961      if (CM.foldTailWithEVL()) {
// NOTE(review): the EVL transform call head (line 7962) is elided here.
7963                                                CM.getMaxSafeElements());
7965      }
7966
// A narrowed-interleave-group variant, if produced, is kept as an additional
// candidate plan alongside the original.
7967      if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
7968        VPlans.push_back(std::move(P));
7969
7970      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
7971      VPlans.push_back(std::move(Plan));
7972    }
7973    VF = SubRange.End;
7974  }
7975}
7976
// Specialize the duplicated base \p Plan for the VF range \p Range: convert
// the scalar VPInstructions into widened/replicated recipes, apply
// predication, reductions, interleave-group and tail-folding transforms.
// Returns the finished plan, or nullptr if a mandatory transform fails
// (Range is still clamped in that case).
7977VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
7978    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
7979
7980  using namespace llvm::VPlanPatternMatch;
7981  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7982
7983  // ---------------------------------------------------------------------------
7984  // Build initial VPlan: Scan the body of the loop in a topological order to
7985  // visit each basic block after having visited its predecessor basic blocks.
7986  // ---------------------------------------------------------------------------
7987
// NOTE(review): the call head computing RequiresScalarEpilogueCheck
// (presumably getDecisionAndClampRange) is elided at line 7989.
7988  bool RequiresScalarEpilogueCheck =
7990          [this](ElementCount VF) {
7991            return !CM.requiresScalarEpilogue(VF.isVector());
7992          },
7993          Range);
7994  // Update the branch in the middle block if a scalar epilogue is required.
7995  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
7996  if (!RequiresScalarEpilogueCheck && MiddleVPBB->getNumSuccessors() == 2) {
7997    auto *BranchOnCond = cast<VPInstruction>(MiddleVPBB->getTerminator());
7998    assert(MiddleVPBB->getSuccessors()[1] == Plan->getScalarPreheader() &&
7999           "second successor must be scalar preheader");
// Force the middle-block branch to always skip the scalar preheader.
8000    BranchOnCond->setOperand(0, Plan->getFalse());
8001  }
8002
8003  // Don't use getDecisionAndClampRange here, because we don't know the UF
8004  // so this function is better to be conservative, rather than to split
8005  // it up into different VPlans.
8006  // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8007  bool IVUpdateMayOverflow = false;
8008  for (ElementCount VF : Range)
8009    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8010
8011  TailFoldingStyle Style = CM.getTailFoldingStyle();
8012  // Use NUW for the induction increment if we proved that it won't overflow in
8013  // the vector loop or when not folding the tail. In the later case, we know
8014  // that the canonical induction increment will not overflow as the vector trip
8015  // count is >= increment and a multiple of the increment.
8016  VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8017  bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8018  if (!HasNUW) {
8019    auto *IVInc =
8020        LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0);
8021    assert(match(IVInc,
8022                 m_VPInstruction<Instruction::Add>(
8023                     m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
8024           "Did not find the canonical IV increment");
8025    cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
8026  }
8027
8028  // ---------------------------------------------------------------------------
8029  // Pre-construction: record ingredients whose recipes we'll need to further
8030  // process after constructing the initial VPlan.
8031  // ---------------------------------------------------------------------------
8032
8033  // For each interleave group which is relevant for this (possibly trimmed)
8034  // Range, add it to the set of groups to be later applied to the VPlan and add
8035  // placeholders for its members' Recipes which we'll be replacing with a
8036  // single VPInterleaveRecipe.
8037  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8038    auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8039      bool Result = (VF.isVector() && // Query is illegal for VF == 1
// NOTE(review): the expected widening-decision enumerator on line 8041 is
// elided from this rendering.
8040                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
8042      // For scalable vectors, the interleave factors must be <= 8 since we
8043      // require the (de)interleaveN intrinsics instead of shufflevectors.
8044      assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8045             "Unsupported interleave factor for scalable vectors");
8046      return Result;
8047    };
8048    if (!getDecisionAndClampRange(ApplyIG, Range))
8049      continue;
8050    InterleaveGroups.insert(IG);
8051  }
8052
8053  // ---------------------------------------------------------------------------
8054  // Construct wide recipes and apply predication for original scalar
8055  // VPInstructions in the loop.
8056  // ---------------------------------------------------------------------------
8057  VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder);
8058
8059  // Scan the body of the loop in a topological order to visit each basic block
8060  // after having visited its predecessor basic blocks.
8061  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8062  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8063      HeaderVPBB);
8064
8065  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8066
8067  // Collect blocks that need predication for in-loop reduction recipes.
8068  DenseSet<BasicBlock *> BlocksNeedingPredication;
8069  for (BasicBlock *BB : OrigLoop->blocks())
8070    if (CM.blockNeedsPredicationForAnyReason(BB))
8071      BlocksNeedingPredication.insert(BB);
8072
8073  VPlanTransforms::createInLoopReductionRecipes(*Plan, BlocksNeedingPredication,
8074                                                Range.Start);
8075
8076  // Now process all other blocks and instructions.
8077  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
8078    // Convert input VPInstructions to widened recipes.
8079    for (VPRecipeBase &R : make_early_inc_range(
8080             make_range(VPBB->getFirstNonPhi(), VPBB->end()))) {
8081      // Skip recipes that do not need transforming.
// NOTE(review): the condition guarding this `continue` (line 8082) is elided
// in this rendering.
8083        continue;
8084      auto *VPI = cast<VPInstruction>(&R);
8085      if (!VPI->getUnderlyingValue())
8086        continue;
8087
8088      // TODO: Gradually replace uses of underlying instruction by analyses on
8089      // VPlan. Migrate code relying on the underlying instruction from VPlan0
8090      // to construct recipes below to not use the underlying instruction.
// NOTE(review): the declaration of Instr (presumably
// `Instruction *Instr = VPI->getUnderlyingInstr();`) is elided (line 8091).
8092      Builder.setInsertPoint(VPI);
8093
8094      // The stores with invariant address inside the loop will be deleted, and
8095      // in the exit block, a uniform store recipe will be created for the final
8096      // invariant store of the reduction.
8097      StoreInst *SI;
8098      if ((SI = dyn_cast<StoreInst>(Instr)) &&
8099          Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8100        // Only create recipe for the final invariant store of the reduction.
8101        if (Legal->isInvariantStoreOfReduction(SI)) {
8102          auto *Recipe = new VPReplicateRecipe(
8103              SI, VPI->operandsWithoutMask(), true /* IsUniform */,
8104              nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
8105          Recipe->insertBefore(*MiddleVPBB, MBIP);
8106        }
8107        R.eraseFromParent();
8108        continue;
8109      }
8110
// Prefer a specific widening recipe; otherwise fall back to replication.
8111      VPRecipeBase *Recipe =
8112          RecipeBuilder.tryToCreateWidenNonPhiRecipe(VPI, Range);
8113      if (!Recipe)
8114        Recipe =
8115            RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
8116
8117      RecipeBuilder.setRecipe(Instr, Recipe);
8118      if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
8119        // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8120        // moved to the phi section in the header.
8121        Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8122      } else {
8123        Builder.insert(Recipe);
8124      }
8125      if (Recipe->getNumDefinedValues() == 1) {
8126        VPI->replaceAllUsesWith(Recipe->getVPSingleValue());
8127      } else {
8128        assert(Recipe->getNumDefinedValues() == 0 &&
8129               "Unexpected multidef recipe");
8130      }
8131      R.eraseFromParent();
8132    }
8133  }
8134
8135  assert(isa<VPRegionBlock>(LoopRegion) &&
8136         !LoopRegion->getEntryBasicBlock()->empty() &&
8137         "entry block must be set to a VPRegionBlock having a non-empty entry "
8138         "VPBasicBlock");
8139
8140  // TODO: We can't call runPass on these transforms yet, due to verifier
8141  // failures.
// NOTE(review): a transform invocation on line 8142 is elided in this
// rendering.
8143
8144  // ---------------------------------------------------------------------------
8145  // Transform initial VPlan: Apply previously taken decisions, in order, to
8146  // bring the VPlan to its final state.
8147  // ---------------------------------------------------------------------------
8148
8149  addReductionResultComputation(Plan, RecipeBuilder, Range.Start);
8150
8151  // Optimize FindIV reductions to use sentinel-based approach when possible.
// NOTE(review): two transform call heads (lines 8152 and 8154) are elided
// here; only their trailing arguments are visible.
8153                                                  *OrigLoop);
8155                                             CM.foldTailByMasking());
8156
8157  // Apply mandatory transformation to handle reductions with multiple in-loop
8158  // uses if possible, bail out otherwise.
// NOTE(review): the condition's call head (line 8159) is elided here.
8160                                                               OrigLoop))
8161    return nullptr;
8162  // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8163  // NaNs if possible, bail out otherwise.
// NOTE(review): the guarded condition (line 8164) is elided here.
8165    return nullptr;
8166
8167  // Create whole-vector selects for find-last recurrences.
// NOTE(review): the guarded condition (line 8168) is elided here.
8169    return nullptr;
8170
8171  // Create partial reduction recipes for scaled reductions and transform
8172  // recipes to abstract recipes if it is legal and beneficial and clamp the
8173  // range for better cost estimation.
8174  // TODO: Enable following transform when the EVL-version of extended-reduction
8175  // and mulacc-reduction are implemented.
8176  if (!CM.foldTailWithEVL()) {
8177    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
8178                          OrigLoop);
// NOTE(review): two transform call heads (lines 8179 and 8181) are elided
// here; only their trailing arguments are visible.
8180                                                Range);
8182                                                      Range);
8183  }
8184
8185  for (ElementCount VF : Range)
8186    Plan->addVF(VF);
8187  Plan->setName("Initial VPlan");
8188
8189  // Interleave memory: for each Interleave Group we marked earlier as relevant
8190  // for this VPlan, replace the Recipes widening its memory instructions with a
8191  // single VPInterleaveRecipe at its insertion point.
// NOTE(review): the interleave-group transform call head (line 8192) is
// elided here.
8193      InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
8194
8195  // Replace VPValues for known constant strides.
// NOTE(review): the stride transform call head (line 8196) is elided here.
8197                                             Legal->getLAI()->getSymbolicStrides());
8198
8199  auto BlockNeedsPredication = [this](BasicBlock *BB) {
8200    return Legal->blockNeedsPredication(BB);
8201  };
// NOTE(review): the predication transform call head (line 8202) is elided.
8203                                       BlockNeedsPredication);
8204
8205  // Sink users of fixed-order recurrence past the recipe defining the previous
8206  // value and introduce FirstOrderRecurrenceSplice VPInstructions.
// NOTE(review): the condition's call head (line 8207) is elided here; this
// bails out when fixed-order recurrences cannot be adjusted.
8208                                                        Builder))
8209    return nullptr;
8210
8211  if (useActiveLaneMask(Style)) {
8212    // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8213    // TailFoldingStyle is visible there.
8214    bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8215    VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow);
8216  }
8217
8218  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8219  return Plan;
8220}
8221
// VPlan-native (outer-loop) path: build a VPlan for the whole VF \p Range
// up front, with no reductions, no early exits, and no tail folding.
// Returns nullptr if the plan cannot be built for this range.
8222VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
8223  // Outer loop handling: They may require CFG and instruction level
8224  // transformations before even evaluating whether vectorization is profitable.
8225  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8226  // the vectorization pipeline.
8227  assert(!OrigLoop->isInnermost());
8228  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8229
8230  auto Plan = VPlanTransforms::buildVPlan0(
8231      OrigLoop, *LI, Legal->getWidestInductionType(),
8232      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
8233
// NOTE(review): the header-phi transform call head (line 8234) is elided in
// this rendering. Empty reduction/recurrence containers are passed: only
// inductions are modeled on this path.
8235      *Plan, PSE, *OrigLoop, Legal->getInductionVars(),
8236      MapVector<PHINode *, RecurrenceDescriptor>(),
8237      SmallPtrSet<const PHINode *, 1>(), SmallPtrSet<PHINode *, 1>(),
8238      /*AllowReordering=*/false);
8239  [[maybe_unused]] bool CanHandleExits = VPlanTransforms::handleEarlyExits(
8240      *Plan, UncountableExitStyle::NoUncountableExit, OrigLoop, PSE, *DT,
8241      Legal->getAssumptionCache());
8242  assert(CanHandleExits &&
8243         "early-exits are not supported in VPlan-native path");
8244  VPlanTransforms::addMiddleCheck(*Plan, /*TailFolded*/ false);
8245
// NOTE(review): a transform invocation on line 8246 is elided here.
8247
8248  for (ElementCount VF : Range)
8249    Plan->addVF(VF);
8250
// NOTE(review): the condition guarding this failure return (line 8251) is
// elided in this rendering.
8252    return nullptr;
8253
8254  // Optimize induction live-out users to use precomputed end values.
// NOTE(review): the transform call head (line 8255) is elided here.
8256                                                     /*FoldTail=*/false);
8257
8258  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8259  return Plan;
8260}
8261
// For each reduction phi in the vector loop, materialize the final result
// computation (ComputeReductionResult / ComputeAnyOfResult) in the middle
// block, add tail-folding selects, narrow truncated reductions, rewire
// out-of-region users, and rewrite AnyOf reductions to boolean form.
8262void LoopVectorizationPlanner::addReductionResultComputation(
8263    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8264  using namespace VPlanPatternMatch;
8265  VPTypeAnalysis TypeInfo(*Plan);
8266  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8267  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
// NOTE(review): the declaration of ToDelete (used near the end of this
// function, presumably a SmallVector<VPRecipeBase *>) is elided in this
// rendering (line 8268).
8269  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8270  Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
8271  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8272  for (VPRecipeBase &R :
8273       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8274    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8275    // TODO: Remove check for constant incoming value once removeDeadRecipes is
8276    // used on VPlan0.
8277    if (!PhiR || isa<VPIRValue>(PhiR->getOperand(1)))
8278      continue;
8279
8280    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
8281    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
// NOTE(review): the argument to getRecurrenceDescriptor (line 8282) is
// elided in this rendering.
8283    Type *PhiTy = TypeInfo.inferScalarType(PhiR);
8284    // If tail is folded by masking, introduce selects between the phi
8285    // and the users outside the vector region of each reduction, at the
8286    // beginning of the dedicated latch block.
8287    auto *OrigExitingVPV = PhiR->getBackedgeValue();
8288    auto *NewExitingVPV = PhiR->getBackedgeValue();
8289    // Don't output selects for partial reductions because they have an output
8290    // with fewer lanes than the VF. So the operands of the select would have
8291    // different numbers of lanes. Partial reductions mask the input instead.
8292    auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
8293    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
8294        (!RR || !RR->isPartialReduction())) {
8295      VPValue *Cond = vputils::findHeaderMask(*Plan);
// Select the updated value only on active lanes; inactive lanes keep the
// phi's previous value.
8296      NewExitingVPV =
8297          Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", *PhiR);
8298      OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
8299        using namespace VPlanPatternMatch;
8300        return match(
8301            &U, m_CombineOr(
8302                    m_VPInstruction<VPInstruction::ComputeAnyOfResult>(),
8303                    m_VPInstruction<VPInstruction::ComputeReductionResult>()));
8304      });
8305
8306      if (CM.usePredicatedReductionSelect(RecurrenceKind))
8307        PhiR->setOperand(1, NewExitingVPV);
8308    }
8309
8310    // We want code in the middle block to appear to execute on the location of
8311    // the scalar loop's latch terminator because: (a) it is all compiler
8312    // generated, (b) these instructions are always executed after evaluating
8313    // the latch conditional branch, and (c) other passes may add new
8314    // predecessors which terminate on this line. This is the easiest way to
8315    // ensure we don't accidentally cause an extra step back into the loop while
8316    // debugging.
8317    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
8318
8319    // TODO: At the moment ComputeReductionResult also drives creation of the
8320    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
8321    // even for in-loop reductions, until the reduction resume value handling is
8322    // also modeled in VPlan.
8323    VPInstruction *FinalReductionResult;
8324    VPBuilder::InsertPointGuard Guard(Builder);
8325    Builder.setInsertPoint(MiddleVPBB, IP);
8326    // For AnyOf reductions, find the select among PhiR's users. This is used
8327    // both to find NewVal for ComputeAnyOfResult and to adjust the reduction.
8328    VPRecipeBase *AnyOfSelect = nullptr;
8329    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
8330      AnyOfSelect = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
8331        return match(U, m_Select(m_VPValue(), m_VPValue(), m_VPValue()));
8332      }));
8333    }
8334    if (AnyOfSelect) {
8335      VPValue *Start = PhiR->getStartValue();
8336      // NewVal is the non-phi operand of the select.
8337      VPValue *NewVal = AnyOfSelect->getOperand(1) == PhiR
8338                            ? AnyOfSelect->getOperand(2)
8339                            : AnyOfSelect->getOperand(1);
8340      FinalReductionResult =
8341          Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
8342                               {Start, NewVal, NewExitingVPV}, ExitDL);
8343    } else {
8344      VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
8345                      PhiR->getFastMathFlags());
8346      FinalReductionResult =
8347          Builder.createNaryOp(VPInstruction::ComputeReductionResult,
8348                               {NewExitingVPV}, Flags, ExitDL);
8349    }
8350    // If the vector reduction can be performed in a smaller type, we truncate
8351    // then extend the loop exit value to enable InstCombine to evaluate the
8352    // entire expression in the smaller type.
// NOTE(review): the second line of this condition (line 8354) and the
// min-max assert's condition line (line 8356) are elided in this rendering.
8353    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
8355      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
8357             "Unexpected truncated min-max recurrence!");
8358      Type *RdxTy = RdxDesc.getRecurrenceType();
8359      VPWidenCastRecipe *Trunc;
8360      Instruction::CastOps ExtendOpc =
8361          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
8362      VPWidenCastRecipe *Extnd;
8363      {
// Insert trunc/extend right after the recipe defining the exiting value, not
// at the current builder insert point.
8364        VPBuilder::InsertPointGuard Guard(Builder);
8365        Builder.setInsertPoint(
8366            NewExitingVPV->getDefiningRecipe()->getParent(),
8367            std::next(NewExitingVPV->getDefiningRecipe()->getIterator()));
8368        Trunc =
8369            Builder.createWidenCast(Instruction::Trunc, NewExitingVPV, RdxTy);
8370        Extnd = Builder.createWidenCast(ExtendOpc, Trunc, PhiTy);
8371      }
8372      if (PhiR->getOperand(1) == NewExitingVPV)
8373        PhiR->setOperand(1, Extnd->getVPSingleValue());
8374
8375      // Update ComputeReductionResult with the truncated exiting value and
8376      // extend its result. Operand 0 provides the values to be reduced.
8377      FinalReductionResult->setOperand(0, Trunc);
8378      FinalReductionResult =
8379          Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
8380    }
8381
8382    // Update all users outside the vector region. Also replace redundant
8383    // extracts.
8384    for (auto *U : to_vector(OrigExitingVPV->users())) {
8385      auto *Parent = cast<VPRecipeBase>(U)->getParent();
8386      if (FinalReductionResult == U || Parent->getParent())
8387        continue;
8388      // Skip FindIV reduction chain recipes (ComputeReductionResult, icmp).
// NOTE(review): the first line of this condition (line 8389, presumably a
// FindIV recurrence-kind check) is elided in this rendering.
8390          match(U, m_CombineOr(
8391                       m_VPInstruction<VPInstruction::ComputeReductionResult>(),
8392                       m_VPInstruction<Instruction::ICmp>())))
8393        continue;
8394      U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
8395
8396      // Look through ExtractLastPart.
// NOTE(review): the guard for this look-through (line 8397) is elided here.
8398        U = cast<VPInstruction>(U)->getSingleUser();
8399
// NOTE(review): the condition guarding this replaceAllUsesWith (lines
// 8400-8401, presumably matching a redundant extract) is elided here.
8402        cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
8403    }
8404
8405    // Adjust AnyOf reductions; replace the reduction phi for the selected value
8406    // with a boolean reduction phi node to check if the condition is true in
8407    // any iteration. The final value is selected by the final
8408    // ComputeReductionResult.
8409    if (AnyOfSelect) {
8410      VPValue *Cmp = AnyOfSelect->getOperand(0);
8411      // If the compare is checking the reduction PHI node, adjust it to check
8412      // the start value.
8413      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
8414        CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
8415      Builder.setInsertPoint(AnyOfSelect);
8416
8417      // If the true value of the select is the reduction phi, the new value is
8418      // selected if the negated condition is true in any iteration.
8419      if (AnyOfSelect->getOperand(1) == PhiR)
8420        Cmp = Builder.createNot(Cmp);
8421      VPValue *Or = Builder.createOr(PhiR, Cmp);
8422      AnyOfSelect->getVPSingleValue()->replaceAllUsesWith(Or);
8423      // Delete AnyOfSelect now that it has invalid types.
8424      ToDelete.push_back(AnyOfSelect);
8425
8426      // Convert the reduction phi to operate on bools.
8427      PhiR->setOperand(0, Plan->getFalse());
8428      continue;
8429    }
8430
8431    RecurKind RK = PhiR->getRecurrenceKind();
// NOTE(review): lines 8432-8435 (presumably the condition selecting which
// reductions get an identity-based start value) are elided in this rendering.
8436      VPBuilder PHBuilder(Plan->getVectorPreheader());
8437      VPValue *Iden = Plan->getOrAddLiveIn(
8438          getRecurrenceIdentity(RK, PhiTy, PhiR->getFastMathFlags()));
8439      auto *ScaleFactorVPV = Plan->getConstantInt(32, 1);
8440      VPValue *StartV = PHBuilder.createNaryOp(
// NOTE(review): the opcode argument of this createNaryOp (line 8441) is
// elided in this rendering.
8442          {PhiR->getStartValue(), Iden, ScaleFactorVPV}, *PhiR);
8443      PhiR->setOperand(0, StartV);
8444    }
8445  }
// Deferred deletions: erase recipes whose result types became invalid above.
8446  for (VPRecipeBase *R : ToDelete)
8447    R->eraseFromParent();
8448
// NOTE(review): a final transform invocation (line 8449) is elided here.
8450}
8451
// NOTE(review): body of the runtime-check attachment member function (its
// name line, presumably `LoopVectorizationPlanner::attachRuntimeChecks(`, is
// elided at line 8452). Attaches the generated SCEV and memory runtime-check
// blocks to \p Plan; check blocks with existing predecessors were already
// wired up and are skipped via hasNPredecessors(0).
8453    VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8454  const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
8455  if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
8456    assert((!CM.OptForSize ||
8457            CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
8458           "Cannot SCEV check stride or overflow when optimizing for size");
8459    VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
8460                                      HasBranchWeights);
8461  }
8462  const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
8463  if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
8464    // VPlan-native path does not do any analysis for runtime checks
8465    // currently.
8466    assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
8467           "Runtime checks are not supported for outer loops yet");
8468
// When forced to vectorize under -Os/-Oz, warn the user that the runtime
// memory checks inflate code size.
8469    if (CM.OptForSize) {
8470      assert(
8471          CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
8472          "Cannot emit memory checks when optimizing for size, unless forced "
8473          "to vectorize.");
8474      ORE->emit([&]() {
8475        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
8476                                          OrigLoop->getStartLoc(),
8477                                          OrigLoop->getHeader())
8478               << "Code-size may be reduced by not forcing "
8479                  "vectorization, or by source-code modifications "
8480                  "eliminating the need for runtime checks "
8481                  "(e.g., adding 'restrict').";
8482      });
8483    }
8484    VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
8485                                      HasBranchWeights);
8486  }
8487}
8488
// NOTE(review): body of the minimum-iteration-check member function (its name
// line, presumably `LoopVectorizationPlanner::addMinimumIterationCheck(`, is
// elided at line 8489). Forwards the trip-count check parameters, the scalar
// epilogue / tail-folding decisions, and the latch's branch weights (if
// present) to the corresponding VPlan transform.
8490    VPlan &Plan, ElementCount VF, unsigned UF,
8491    ElementCount MinProfitableTripCount) const {
8492  const uint32_t *BranchWeights =
8493      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
// NOTE(review): the true-branch operand of this conditional (line 8494) and
// the transform call head (line 8496) are elided in this rendering.
8495          : nullptr;
8497      Plan, VF, UF, MinProfitableTripCount,
8498      CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
8499      OrigLoop, BranchWeights,
8500      OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
8501}
8502
8503// Determine how to lower the scalar epilogue, which depends on 1) optimising
8504// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8505// predication, and 4) a TTI hook that analyses whether the loop is suitable
8506// for predication.
// NOTE(review): body of getScalarEpilogueLowering (its return type and name
// line, ~8507, and part of its parameter list, ~8509-8510, are elided in
// this rendering). Picks the scalar-epilogue strategy in priority order:
// opt-for-size, command-line directive, loop hints, then the TTI
// predication-preference hook. Most `return` statements are elided below.
8508    Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
8511  // 1) OptSize takes precedence over all other options, i.e. if this is set,
8512  // don't look at hints or options, and don't request a scalar epilogue.
8513  if (F->hasOptSize() ||
8514      (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
// NOTE(review): the returned enumerator (line 8515) is elided here.
8516
8517  // 2) If set, obey the directives
8518  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
// NOTE(review): the switch over the PreferPredicateOverEpilogue option and
// its case returns (lines 8519-8525) are elided in this rendering.
8526    };
8527  }
8528
8529  // 3) If set, obey the hints
8530  switch (Hints.getPredicate()) {
// NOTE(review): the hint cases and their returns (lines 8531-8534) are
// elided in this rendering.
8535  };
8536
8537  // 4) if the TTI hook indicates this is profitable, request predication.
8538  TailFoldingInfo TFI(TLI, &LVL, IAI);
8539  if (TTI->preferPredicateOverEpilogue(&TFI))
// NOTE(review): the returned enumerators (lines 8540 and 8542) are elided.
8541
8543}
8544
8545// Process the loop in the VPlan-native vectorization path. This path builds
8546// VPlan upfront in the vectorization pipeline, which allows to apply
8547// VPlan-to-VPlan transformations from the very beginning without modifying the
8548// input LLVM IR.
// NOTE(review): body of processLoopInVPlanNativePath (its name and the first
// parameters, lines ~8549-8553, are elided in this rendering). Drives
// outer-loop vectorization through the VPlan-native pipeline: builds the cost
// model and planner, plans with the user-specified VF, then executes the best
// plan. Returns true if the loop was vectorized.
8554    std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
8555    LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {
8556
// NOTE(review): the guard condition for this bail-out (line 8557, presumably
// a trip-count computability check) is elided in this rendering.
8558    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8559    return false;
8560  }
8561  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8562  Function *F = L->getHeader()->getParent();
8563  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8564
// NOTE(review): the declaration receiving this call result (line 8565,
// presumably `ScalarEpilogueLowering SEL =`) is elided here.
8566      getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI);
8567
8568  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
8569                                GetBFI, F, &Hints, IAI, OptForSize);
8570  // Use the planner for outer loop vectorization.
8571  // TODO: CM is not used at this point inside the planner. Turn CM into an
8572  // optional argument if we don't need it in the future.
8573  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
8574                               ORE);
8575
8576  // Get user vectorization factor.
8577  ElementCount UserVF = Hints.getWidth();
8578
// NOTE(review): a statement on line 8579 is elided in this rendering.
8580
8581  // Plan how to best vectorize, return the best VF and its cost.
8582  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8583
8584  // If we are stress testing VPlan builds, do not attempt to generate vector
8585  // code. Masked vector code generation support will follow soon.
8586  // Also, do not attempt to vectorize if no vector code will be produced.
// NOTE(review): the condition for this bail-out (line 8587) is elided here.
8588    return false;
8589
8590  VPlan &BestPlan = LVP.getPlanFor(VF.Width);
8591
8592  {
8593    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
8594    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
8595                           Checks, BestPlan);
8596    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8597                      << L->getHeader()->getParent()->getName() << "\"\n");
// NOTE(review): the final argument of addMinimumIterationCheck (line 8599)
// is elided in this rendering.
8598    LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
8600    bool HasBranchWeights =
8601        hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
8602    LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights);
8603
8604    LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT);
8605  }
8606
// Interleave count is always 1 on the VPlan-native path.
8607  reportVectorization(ORE, L, VF, 1);
8608
8609  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8610  return true;
8611}
8612
8613 // Emit a remark if there are stores to floats that required a floating point
8614 // extension. If the vectorized loop was generated with floating point there
8615 // will be a performance penalty from the conversion overhead and the change in
8616 // the vector width.
// NOTE(review): extraction gaps — the function signature (8617-8618) and the
// declarations of Worklist, Visited, and EmittedRemark (8630-8631) are missing
// from this view. Judging by their use, Worklist holds Instruction pointers
// (pop_back_val suggests a SmallVector) and Visited/EmittedRemark are
// insert-returning-pair sets — TODO confirm against the full source.
// Seed the worklist with all stores of 'float'-typed values in the loop.
8619   for (BasicBlock *BB : L->getBlocks()) {
8620     for (Instruction &Inst : *BB) {
8621       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
8622         if (S->getValueOperand()->getType()->isFloatTy())
8623           Worklist.push_back(S);
8624       }
8625     }
8626   }
8627
8628   // Traverse the floating point stores upwards searching, for floating point
8629   // conversions.
8632   while (!Worklist.empty()) {
8633     auto *I = Worklist.pop_back_val();
// Restrict the backward traversal to instructions inside the loop.
8634     if (!L->contains(I))
8635       continue;
// Visited guards against re-processing shared operand chains.
8636     if (!Visited.insert(I).second)
8637       continue;
8638
8639     // Emit a remark if the floating point store required a floating
8640     // point conversion.
8641     // TODO: More work could be done to identify the root cause such as a
8642     // constant or a function return type and point the user to it.
// EmittedRemark deduplicates remarks per fpext instruction.
8643     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
8644       ORE->emit([&]() {
8645         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
8646                                           I->getDebugLoc(), L->getHeader())
8647                << "floating point conversion changes vector width. "
8648                << "Mixed floating point precision requires an up/down "
8649                << "cast that will negatively impact performance.";
8650       });
8651
// Continue the upward traversal through all instruction operands.
8652     for (Use &Op : I->operands())
8653       if (auto *OpI = dyn_cast<Instruction>(Op))
8654         Worklist.push_back(OpI);
8655   }
8656}
8657
8658 /// For loops with uncountable early exits, find the cost of doing work when
8659 /// exiting the loop early, such as calculating the final exit values of
8660 /// variables used outside the loop.
8661 /// TODO: This is currently overly pessimistic because the loop may not take
8662 /// the early exit, but better to keep this conservative for now. In future,
8663 /// it might be possible to relax this by using branch probabilities.
// NOTE(review): the first signature line (8664) is missing from this view;
// the callers below pass (CostCtx, Plan, VF.Width), so the missing line
// presumably declares the return type, the function name, and a
// VPCostContext &CostCtx parameter — confirm against the full source.
8665     VPlan &Plan, ElementCount VF) {
8666   InstructionCost Cost = 0;
// Sum the cost of every exit-block predecessor that is not the middle block:
// by construction those are vector.early.exit blocks.
8667   for (auto *ExitVPBB : Plan.getExitBlocks()) {
8668     for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
8669       // If the predecessor is not the middle.block, then it must be the
8670       // vector.early.exit block, which may contain work to calculate the exit
8671       // values of variables used outside the loop.
8672       if (PredVPBB != Plan.getMiddleBlock()) {
8673         LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
8674                           << PredVPBB->getName() << ":\n");
8675         Cost += PredVPBB->cost(VF, CostCtx);
8676       }
8677     }
8678   }
8679   return Cost;
8680}
8681
8682 /// This function determines whether or not it's still profitable to vectorize
8683 /// the loop given the extra work we have to do outside of the loop:
8684 /// 1. Perform the runtime checks before entering the loop to ensure it's safe
8685 ///    to vectorize.
8686 /// 2. In the case of loops with uncountable early exits, we may have to do
8687 ///    extra work when exiting the loop early, such as calculating the final
8688 ///    exit values of variables used outside the loop.
8689 /// 3. The middle block.
// NOTE(review): extraction gaps — two parameter lines (8692, 8694) are
// missing; the body reads SEL, PSE and VScale, so the missing lines
// presumably declare a ScalarEpilogueLowering SEL and a
// PredicatedScalarEvolution &PSE — confirm against the full source. Also
// missing: the threshold comparison guarding the interleave-only bailout
// (8704) and the statement at 8777 that records the computed minimum
// (VF.MinProfitableTripCount is printed and compared below, so 8777
// presumably assigns it).
8690 static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
8691                                         VectorizationFactor &VF, Loop *L,
8693                                         VPCostContext &CostCtx, VPlan &Plan,
8695                                         std::optional<unsigned> VScale) {
// An invalid runtime-check cost means we cannot reason about profitability;
// treat as not profitable.
8696   InstructionCost RtC = Checks.getCost();
8697   if (!RtC.isValid())
8698     return false;
8699
8700   // When interleaving only scalar and vector cost will be equal, which in turn
8701   // would lead to a divide by 0. Fall back to hard threshold.
8702   if (VF.Width.isScalar()) {
8703     // TODO: Should we rename VectorizeMemoryCheckThreshold?
8705       LLVM_DEBUG(
8706           dbgs()
8707           << "LV: Interleaving only is not profitable due to runtime checks\n");
8708       return false;
8709     }
8710     return true;
8711   }
8712
8713   // The scalar cost should only be 0 when vectorizing with a user specified
8714   // VF/IC. In those cases, runtime checks should always be generated.
8715   uint64_t ScalarC = VF.ScalarCost.getValue();
8716   if (ScalarC == 0)
8717     return true;
8718
8719   InstructionCost TotalCost = RtC;
8720   // Add on the cost of any work required in the vector early exit block, if
8721   // one exists.
8722   TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
8723   TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
8724
8725   // First, compute the minimum iteration count required so that the vector
8726   // loop outperforms the scalar loop.
8727   // The total cost of the scalar loop is
8728   //   ScalarC * TC
8729   // where
8730   // * TC is the actual trip count of the loop.
8731   // * ScalarC is the cost of a single scalar iteration.
8732   //
8733   // The total cost of the vector loop is
8734   //   TotalCost + VecC * (TC / VF) + EpiC
8735   // where
8736   // * TotalCost is the sum of the costs cost of
8737   //    - the generated runtime checks, i.e. RtC
8738   //    - performing any additional work in the vector.early.exit block for
8739   //      loops with uncountable early exits.
8740   //    - the middle block, if ExpectedTC <= VF.Width.
8741   // * VecC is the cost of a single vector iteration.
8742   // * TC is the actual trip count of the loop
8743   // * VF is the vectorization factor
8744   // * EpiCost is the cost of the generated epilogue, including the cost
8745   //   of the remaining scalar operations.
8746   //
8747   // Vectorization is profitable once the total vector cost is less than the
8748   // total scalar cost:
8749   //   TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
8750   //
8751   // Now we can compute the minimum required trip count TC as
8752   //   VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
8753   //
8754   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
8755   // the computations are performed on doubles, not integers and the result
8756   // is rounded up, hence we get an upper estimate of the TC.
8757   unsigned IntVF = estimateElementCount(VF.Width, VScale);
// Div can be 0 when a scalar iteration is estimated no cheaper per element
// than a vector iteration; guard the division and treat MinTC1 as 0.
8758   uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
8759   uint64_t MinTC1 =
8760       Div == 0 ? 0 : divideCeil(TotalCost.getValue() * IntVF, Div);
8761
8762   // Second, compute a minimum iteration count so that the cost of the
8763   // runtime checks is only a fraction of the total scalar loop cost. This
8764   // adds a loop-dependent bound on the overhead incurred if the runtime
8765   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
8766   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
8767   // cost, compute
8768   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
// X is hard-coded to 10 here, i.e. runtime checks may cost at most 1/10 of
// the scalar loop.
8769   uint64_t MinTC2 = divideCeil(RtC.getValue() * 10, ScalarC);
8770
8771   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
8772   // epilogue is allowed, choose the next closest multiple of VF. This should
8773   // partly compensate for ignoring the epilogue cost.
8774   uint64_t MinTC = std::max(MinTC1, MinTC2);
8775   if (SEL == CM_ScalarEpilogueAllowed)
8776     MinTC = alignTo(MinTC, IntVF);
8778
8779   LLVM_DEBUG(
8780       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
8781              << VF.MinProfitableTripCount << "\n");
8782
8783   // Skip vectorization if the expected trip count is less than the minimum
8784   // required trip count.
8785   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
8786     if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
8787       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
8788                            "trip count < minimum profitable VF ("
8789                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
8790                         << ")\n");
8791
8792       return false;
8793     }
8794   }
8795   return true;
8796}
8797
// NOTE(review): LoopVectorizePass constructor fragment — the declaration line
// (8798) and the right-hand side of each member-init expression (8800, 8802)
// are missing from this view. Each member takes the value from Opts OR'd with
// something on the missing line — presumably the corresponding cl::opt command
// line flag; confirm against the full source.
8799     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8801       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8803
8804 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
8805 /// vectorization.
// NOTE(review): extraction gaps — the function's name/signature lines
// (8806-8807) are missing; based on the body it takes the main-loop VPlan
// (MainPlan) and the epilogue VPlan (EpiPlan) and returns a vector of
// VPInstruction resume values (see the trailing to_vector/map_range) —
// confirm against the full source. Also missing: 8825 (a guard inside the
// freeze lambda), 8841 (initializer of Term, presumably the exiting
// terminator of MainPlan's vector loop), and 8859 (first line of the
// canonical-IV assert condition).
8808   using namespace VPlanPatternMatch;
8809   // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
8810   // introduce multiple uses of undef/poison. If the reduction start value may
8811   // be undef or poison it needs to be frozen and the frozen start has to be
8812   // used when computing the reduction result. We also need to use the frozen
8813   // value in the resume phi generated by the main vector loop, as this is also
8814   // used to compute the reduction result after the epilogue vector loop.
8815   auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
8816                                              bool UpdateResumePhis) {
8817     VPBuilder Builder(Plan.getEntry());
// Scan the middle block for FindIV reduction results and freeze their start
// values.
8818     for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
8819       auto *VPI = dyn_cast<VPInstruction>(&R);
8820       if (!VPI)
8821         continue;
8822       VPValue *OrigStart;
8823       if (!matchFindIVResult(VPI, m_VPValue(), m_VPValue(OrigStart)))
8824         continue;
// A guard condition on line 8825 (missing from this view) skips some start
// values — presumably those already known not to be undef/poison.
8826         continue;
8827       VPInstruction *Freeze =
8828           Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
8829       VPI->setOperand(2, Freeze);
// In the main plan, the resume phis must also use the frozen value; skip the
// freeze itself to avoid a self-reference.
8830       if (UpdateResumePhis)
8831         OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
8832           return Freeze != &U && isa<VPPhi>(&U);
8833         });
8834     }
8835   };
8836   AddFreezeForFindLastIVReductions(MainPlan, true);
8837   AddFreezeForFindLastIVReductions(EpiPlan, false);
8838
// Extract the main loop's vector trip count from its BranchOnCount terminator.
8839   VPValue *VectorTC = nullptr;
8840   auto *Term =
8842   [[maybe_unused]] bool MatchedTC =
8843       match(Term, m_BranchOnCount(m_VPValue(), m_VPValue(VectorTC)));
8844   assert(MatchedTC && "must match vector trip count");
8845
8846   // If there is a suitable resume value for the canonical induction in the
8847   // scalar (which will become vector) epilogue loop, use it and move it to the
8848   // beginning of the scalar preheader. Otherwise create it below.
8849   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
8850   auto ResumePhiIter =
8851       find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
8852         return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
8853                                                            m_ZeroInt()));
8854       });
8855   VPPhi *ResumePhi = nullptr;
8856   if (ResumePhiIter == MainScalarPH->phis().end()) {
8857     using namespace llvm::VPlanPatternMatch;
// First line of this assert's condition (8859) is missing from this view.
8858     assert(
8860                  m_ZeroInt()) &&
8861            "canonical IV must start at 0");
8862     Type *Ty = VPTypeAnalysis(MainPlan).inferScalarType(VectorTC);
8863     VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
8864     ResumePhi = ScalarPHBuilder.createScalarPhi(
8865         {VectorTC, MainPlan.getZero(Ty)}, {}, "vec.epilog.resume.val");
8866   } else {
8867     ResumePhi = cast<VPPhi>(&*ResumePhiIter);
8868     ResumePhi->setName("vec.epilog.resume.val");
// Move the existing resume phi to the front so it is the first phi in the
// scalar preheader.
8869     if (&MainScalarPH->front() != ResumePhi)
8870       ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
8871   }
8872
8873   // Create a ResumeForEpilogue for the canonical IV resume as the
8874   // first non-phi, to keep it alive for the epilogue.
8875   VPBuilder ResumeBuilder(MainScalarPH);
8876   ResumeBuilder.createNaryOp(VPInstruction::ResumeForEpilogue, ResumePhi);
8877
8878   // Create ResumeForEpilogue instructions for the resume phis of the
8879   // VPIRPhis in the scalar header of the main plan and return them so they can
8880   // be used as resume values when vectorizing the epilogue.
8881   return to_vector(
8882       map_range(MainPlan.getScalarHeader()->phis(), [&](VPRecipeBase &R) {
8883         assert(isa<VPIRPhi>(R) &&
8884                "only VPIRPhis expected in the scalar header");
8885         return ResumeBuilder.createNaryOp(VPInstruction::ResumeForEpilogue,
8886                                           R.getOperand(0));
8887       }));
8888}
8889
8890 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
8891 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
8892 /// reductions require creating new instructions to compute the resume values.
8893 /// They are collected in a vector and returned. They must be moved to the
8894 /// preheader of the vector epilogue loop, after created by the execution of \p
8895 /// Plan.
// NOTE(review): extraction gaps — the return-type/name line (8896) and one
// parameter line (8898) are missing; the body uses EPI and CM, so 8898
// presumably declares an EpilogueLoopVectorizationInfo &EPI and a cost-model
// reference — confirm against the full source. Other missing interior lines
// are flagged inline below.
8897     VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
8899     ScalarEvolution &SE) {
8900   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
8901   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
8902   Header->setName("vec.epilog.vector.body");
8903
8904   VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
8905   // When vectorizing the epilogue loop, the canonical induction needs to start
8906   // at the resume value from the main vector loop. Find the resume value
8907   // created during execution of the main VPlan. It must be the first phi in the
8908   // loop preheader. Add this resume value as an offset to the canonical IV of
8909   // the epilogue loop.
8910   using namespace llvm::PatternMatch;
8911   PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
// The resume phi merges 0 (main vector loop skipped) with the main loop's
// vector trip count; the single non-zero incoming value is that trip count.
8912   for (Value *Inc : EPResumeVal->incoming_values()) {
8913     if (match(Inc, m_SpecificInt(0)))
8914       continue;
8915     assert(!EPI.VectorTripCount &&
8916            "Must only have a single non-zero incoming value");
8917     EPI.VectorTripCount = Inc;
8918   }
8919   // If we didn't find a non-zero vector trip count, all incoming values
8920   // must be zero, which also means the vector trip count is zero. Pick the
8921   // first zero as vector trip count.
8922   // TODO: We should not choose VF * UF so the main vector loop is known to
8923   // be dead.
8924   if (!EPI.VectorTripCount) {
8925     assert(EPResumeVal->getNumIncomingValues() > 0 &&
8926            all_of(EPResumeVal->incoming_values(),
8927                   [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
8928            "all incoming values must be 0");
8929     EPI.VectorTripCount = EPResumeVal->getOperand(0);
8930   }
8931   VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
8932   assert(all_of(IV->users(),
8933                 [](const VPUser *U) {
8934                   return isa<VPScalarIVStepsRecipe>(U) ||
8935                          isa<VPDerivedIVRecipe>(U) ||
8936                          cast<VPRecipeBase>(U)->isScalarCast() ||
8937                          cast<VPInstruction>(U)->getOpcode() ==
8938                              Instruction::Add;
8939                 }) &&
8940          "the canonical IV should only be used by its increment or "
8941          "ScalarIVSteps when resetting the start value");
8942   VPBuilder Builder(Header, Header->getFirstNonPhi());
8943   VPInstruction *Add = Builder.createAdd(IV, VPV);
8944   // Replace all users of the canonical IV and its increment with the offset
8945   // version, except for the Add itself and the canonical IV increment.
8946   auto *Increment = cast<VPInstruction>(IV->getBackedgeValue());
8947   IV->replaceUsesWithIf(Add, [Add, Increment](VPUser &U, unsigned) {
8948     return &U != Add && &U != Increment;
8949   });
// The initializer of OffsetIVInc (8951, presumably creating an add of the
// increment and the resume offset) is missing from this view.
8950   VPInstruction *OffsetIVInc =
8952   Increment->replaceUsesWithIf(OffsetIVInc,
8953                                [IV](VPUser &U, unsigned) { return &U != IV; });
8954   OffsetIVInc->setOperand(0, Increment);
8955
// The declaration of ToFrozen (8956) is missing from this view; it maps
// original start values to their frozen counterparts from the main loop
// (see FindIV handling and the Freeze re-use loop below) — presumably a
// DenseMap; confirm against the full source.
8957   SmallVector<Instruction *> InstsToMove;
8958   // Ensure that the start values for all header phi recipes are updated before
8959   // vectorizing the epilogue loop. Skip the canonical IV, which has been
8960   // handled above.
8961   for (VPRecipeBase &R : drop_begin(Header->phis())) {
8962     Value *ResumeV = nullptr;
8963     // TODO: Move setting of resume values to prepareToExecute.
8964     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
8965       // Find the reduction result by searching users of the phi or its backedge
8966       // value.
// The opcode checks inside this predicate (8971-8972) are missing from this
// view; presumably they test for the Compute*Result VPInstruction opcodes.
8967       auto IsReductionResult = [](VPRecipeBase *R) {
8968         auto *VPI = dyn_cast<VPInstruction>(R);
8969         if (!VPI)
8970           return false;
8973       };
8974       auto *RdxResult = cast<VPInstruction>(
8975           vputils::findRecipe(ReductionPhi->getBackedgeValue(), IsReductionResult));
8976       assert(RdxResult && "expected to find reduction result");
8977
8978       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
8979                     ->getIncomingValueForBlock(L->getLoopPreheader());
8980
8981       // Check for FindIV pattern by looking for icmp user of RdxResult.
8982       // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
8983       using namespace VPlanPatternMatch;
8984       VPValue *SentinelVPV = nullptr;
8985       bool IsFindIV = any_of(RdxResult->users(), [&](VPUser *U) {
8986         return match(U, VPlanPatternMatch::m_SpecificICmp(
8987                             ICmpInst::ICMP_NE, m_Specific(RdxResult),
8988                             m_VPValue(SentinelVPV)));
8989       });
8990
8991       if (RdxResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
8992         Value *StartV = RdxResult->getOperand(0)->getLiveInIRValue();
8993         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
8994         // start value; compare the final value from the main vector loop
8995         // to the start value.
8996         BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
8997         IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
8998         ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
8999         if (auto *I = dyn_cast<Instruction>(ResumeV))
9000           InstsToMove.push_back(I);
9001       } else if (IsFindIV) {
9002         assert(SentinelVPV && "expected to find icmp using RdxResult");
9003
9004         // Get the frozen start value from the main loop.
// The block argument of getIncomingValueForBlock (9006) is missing from this
// view; presumably the main loop's middle/bypass block.
9005         Value *FrozenStartV = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
9007         if (auto *FreezeI = dyn_cast<FreezeInst>(FrozenStartV))
9008           ToFrozen[FreezeI->getOperand(0)] = FrozenStartV;
9009
9010         // Adjust resume: select(icmp eq ResumeV, FrozenStartV), Sentinel,
9011         // ResumeV
9012         BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
9013         IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9014         Value *Cmp = Builder.CreateICmpEQ(ResumeV, FrozenStartV);
9015         if (auto *I = dyn_cast<Instruction>(Cmp))
9016           InstsToMove.push_back(I);
9017         ResumeV =
9018             Builder.CreateSelect(Cmp, SentinelVPV->getLiveInIRValue(), ResumeV);
9019         if (auto *I = dyn_cast<Instruction>(ResumeV))
9020           InstsToMove.push_back(I);
9021       } else {
9022         VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9023         auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
// The condition of the assert whose message is "unexpected start value"
// (9025) is missing from this view.
9024         if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
9026                  "unexpected start value");
9027           // Partial sub-reductions always start at 0 and account for the
9028           // reduction start value in a final subtraction. Update it to use the
9029           // resume value from the main vector loop.
9030           if (PhiR->getVFScaleFactor() > 1 &&
9031               PhiR->getRecurrenceKind() == RecurKind::Sub) {
9032             auto *Sub = cast<VPInstruction>(RdxResult->getSingleUser());
9033             assert(Sub->getOpcode() == Instruction::Sub && "Unexpected opcode");
9034             assert(isa<VPIRValue>(Sub->getOperand(0)) &&
9035                    "Expected operand to match the original start value of the "
9036                    "reduction");
// The condition lines of this assert (9037-9038) are missing from this view.
9039                    "Expected start value for partial sub-reduction to start at "
9040                    "zero");
9041             Sub->setOperand(0, StartVal);
9042           } else
9043             VPI->setOperand(0, StartVal);
9044           continue;
9045         }
9046       }
9047     } else {
9048       // Retrieve the induction resume values for wide inductions from
9049       // their original phi nodes in the scalar loop.
9050       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
9051       // Hook up to the PHINode generated by a ResumePhi recipe of main
9052       // loop VPlan, which feeds the scalar loop.
9053       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
9054     }
9055     assert(ResumeV && "Must have a resume value");
9056     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9057     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
9058   }
9059
9060   // For some VPValues in the epilogue plan we must re-use the generated IR
9061   // values from the main plan. Replace them with live-in VPValues.
9062   // TODO: This is a workaround needed for epilogue vectorization and it
9063   // should be removed once induction resume value creation is done
9064   // directly in VPlan.
9065   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
9066     // Re-use frozen values from the main plan for Freeze VPInstructions in the
9067     // epilogue plan. This ensures all users use the same frozen value.
9068     auto *VPI = dyn_cast<VPInstruction>(&R);
// The replaceAllUsesWith call opening (9070) is missing from this view; 9071
// is its argument — presumably replacing the Freeze with the main loop's
// frozen value looked up in ToFrozen.
9069     if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9071           ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
9072       continue;
9073     }
9074
9075     // Re-use the trip count and steps expanded for the main loop, as
9076     // skeleton creation needs it as a value that dominates both the scalar
9077     // and vector epilogue loops
9078     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
9079     if (!ExpandR)
9080       continue;
9081     VPValue *ExpandedVal =
9082         Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
9083     ExpandR->replaceAllUsesWith(ExpandedVal);
9084     if (Plan.getTripCount() == ExpandR)
9085       Plan.resetTripCount(ExpandedVal);
9086     ExpandR->eraseFromParent();
9087   }
9088
9089   auto VScale = CM.getVScaleForTuning();
9090   unsigned MainLoopStep =
9091       estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
9092   unsigned EpilogueLoopStep =
9093       estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
// The callee name of this call (9094, presumably a VPlanTransforms entry
// point) and one argument line (9096) are missing from this view.
9095       Plan, EPI.VectorTripCount,
9097       EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
9098
9099   return InstsToMove;
9100}
9101
// Fix up scalar-loop resume phis for paths that bypass the epilogue vector
// loop.
// NOTE(review): the line naming this function and its first parameters (9103)
// is missing from this view; the body reads BypassBlock and L, so 9103
// presumably declares them — the callsite below (connectEpilogueVectorLoop)
// passes the vec.epilog iteration-count check block, L, the epilogue plan and
// the resume values. Confirm against the full source.
9102 static void
9104                                 VPlan &BestEpiPlan,
9105                                 ArrayRef<VPInstruction *> ResumeValues) {
9106   // Fix resume values from the additional bypass block.
9107   BasicBlock *PH = L->getLoopPreheader();
// Any preheader predecessor without an incoming value in a phi gets the value
// that flows in from the bypass block.
9108   for (auto *Pred : predecessors(PH)) {
9109     for (PHINode &Phi : PH->phis()) {
9110       if (Phi.getBasicBlockIndex(Pred) != -1)
9111         continue;
9112       Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
9113     }
9114   }
9115   auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
9116   if (ScalarPH->hasPredecessors()) {
9117     // Fix resume values for inductions and reductions from the additional
9118     // bypass block using the incoming values from the main loop's resume phis.
9119     // ResumeValues correspond 1:1 with the scalar loop header phis.
9120     for (auto [ResumeV, HeaderPhi] :
9121          zip(ResumeValues, BestEpiPlan.getScalarHeader()->phis())) {
9122       auto *HeaderPhiR = cast<VPIRPhi>(&HeaderPhi);
9123       auto *EpiResumePhi =
9124           cast<PHINode>(HeaderPhiR->getIRPhi().getIncomingValueForBlock(PH));
// Skip phis that have no edge from the bypass block.
9125       if (EpiResumePhi->getBasicBlockIndex(BypassBlock) == -1)
9126         continue;
9127       auto *MainResumePhi = cast<PHINode>(ResumeV->getUnderlyingValue());
9128       EpiResumePhi->setIncomingValueForBlock(
9129           BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
9130     }
9131   }
9132 }
9133
9134 /// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
9135 /// loop, after both plans have executed, updating branches from the iteration
9136 /// and runtime checks of the main loop, as well as updating various phis. \p
9137 /// InstsToMove contains instructions that need to be moved to the preheader of
9138 /// the epilogue vector loop.
// NOTE(review): extraction gaps — one parameter line (9140, presumably the
// EpilogueLoopVectorizationInfo &EPI used below), the condition of the assert
// at 9154 (line 9153), and parts of the replaceUsesOfWith/applyUpdates call
// sequences (9156, 9159, 9161, 9166, 9169, 9171 — presumably the lines naming
// EPI.MainLoopIterationCountCheck / EPI.EpilogueIterationCountCheck as the
// blocks being rewired) are missing from this view. Confirm the exact edges
// against the full source.
9139 static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
9141                                       DominatorTree *DT,
9142                                       GeneratedRTChecks &Checks,
9143                                       ArrayRef<Instruction *> InstsToMove,
9144                                       ArrayRef<VPInstruction *> ResumeValues) {
9145   BasicBlock *VecEpilogueIterationCountCheck =
9146       cast<VPIRBasicBlock>(EpiPlan.getEntry())->getIRBasicBlock();
9147
// Successor 1 of the conditional branch is the epilogue vector preheader.
9148   BasicBlock *VecEpiloguePreHeader =
9149       cast<CondBrInst>(VecEpilogueIterationCountCheck->getTerminator())
9150           ->getSuccessor(1);
9151   // Adjust the control flow taking the state info from the main loop
9152   // vectorization into account.
9154          "expected this to be saved from the previous pass.");
9155   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
9157       VecEpilogueIterationCountCheck, VecEpiloguePreHeader);
9159
9160                                      VecEpilogueIterationCountCheck},
9162                                      VecEpiloguePreHeader}});
9163
9164   BasicBlock *ScalarPH =
9165       cast<VPIRBasicBlock>(EpiPlan.getScalarPreheader())->getIRBasicBlock();
9167       VecEpilogueIterationCountCheck, ScalarPH);
9168   DTU.applyUpdates(
9170                     VecEpilogueIterationCountCheck},
9172
9173   // Adjust the terminators of runtime check blocks and phis using them.
9174   BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
9175   BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
// Redirect failing runtime checks straight to the scalar preheader instead of
// the epilogue iteration-count check, and keep the dominator tree in sync.
9176   if (SCEVCheckBlock) {
9177     SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
9178         VecEpilogueIterationCountCheck, ScalarPH);
9179     DTU.applyUpdates({{DominatorTree::Delete, SCEVCheckBlock,
9180                        VecEpilogueIterationCountCheck},
9181                       {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
9182   }
9183   if (MemCheckBlock) {
9184     MemCheckBlock->getTerminator()->replaceUsesOfWith(
9185         VecEpilogueIterationCountCheck, ScalarPH);
9186     DTU.applyUpdates(
9187         {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
9188          {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
9189   }
9190
9191   // The vec.epilog.iter.check block may contain Phi nodes from inductions
9192   // or reductions which merge control-flow from the latch block and the
9193   // middle block. Update the incoming values here and move the Phi into the
9194   // preheader.
// Snapshot the phis first: they are moved while iterating.
9195   SmallVector<PHINode *, 4> PhisInBlock(
9196       llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
9197
9198   for (PHINode *Phi : PhisInBlock) {
9199     Phi->moveBefore(VecEpiloguePreHeader->getFirstNonPHIIt());
9200     Phi->replaceIncomingBlockWith(
9201         VecEpilogueIterationCountCheck->getSinglePredecessor(),
9202         VecEpilogueIterationCountCheck);
9203
9204     // If the phi doesn't have an incoming value from the
9205     // EpilogueIterationCountCheck, we are done. Otherwise remove the
9206     // incoming value and also those from other check blocks. This is needed
9207     // for reduction phis only.
9208     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
9209           return EPI.EpilogueIterationCountCheck == IncB;
9210         }))
9211       continue;
9212     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
9213     if (SCEVCheckBlock)
9214       Phi->removeIncomingValue(SCEVCheckBlock);
9215     if (MemCheckBlock)
9216       Phi->removeIncomingValue(MemCheckBlock);
9217   }
9218
// Relocate the resume-value computations created during plan preparation into
// the epilogue vector preheader, after any phis.
9219   auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
9220   for (auto *I : InstsToMove)
9221     I->moveBefore(IP);
9222
9223   // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
9224   // after executing the main loop. We need to update the resume values of
9225   // inductions and reductions during epilogue vectorization.
9226   fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
9227                                   ResumeValues);
9228
9229   // Remove dead phis that were moved to the epilogue preheader but are unused
9230   // (e.g., resume phis for inductions not widened in the epilogue vector loop).
9231   for (PHINode &Phi : make_early_inc_range(VecEpiloguePreHeader->phis()))
9232     if (Phi.use_empty())
9233       Phi.eraseFromParent();
9234}
9235
9237 assert((EnableVPlanNativePath || L->isInnermost()) &&
9238 "VPlan-native path is not enabled. Only process inner loops.");
9239
9240 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9241 << L->getHeader()->getParent()->getName() << "' from "
9242 << L->getLocStr() << "\n");
9243
9244 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9245
9246 LLVM_DEBUG(
9247 dbgs() << "LV: Loop hints:"
9248 << " force="
9250 ? "disabled"
9252 ? "enabled"
9253 : "?"))
9254 << " width=" << Hints.getWidth()
9255 << " interleave=" << Hints.getInterleave() << "\n");
9256
9257 // Function containing loop
9258 Function *F = L->getHeader()->getParent();
9259
9260 // Looking at the diagnostic output is the only way to determine if a loop
9261 // was vectorized (other than looking at the IR or machine code), so it
9262 // is important to generate an optimization remark for each loop. Most of
9263 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9264 // generated as OptimizationRemark and OptimizationRemarkMissed are
9265 // less verbose reporting vectorized loops and unvectorized loops that may
9266 // benefit from vectorization, respectively.
9267
9268 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9269 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9270 return false;
9271 }
9272
9273 PredicatedScalarEvolution PSE(*SE, *L);
9274
9275 // Query this against the original loop and save it here because the profile
9276 // of the original loop header may change as the transformation happens.
9277 bool OptForSize = llvm::shouldOptimizeForSize(
9278 L->getHeader(), PSI,
9279 PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
9281
9282 // Check if it is legal to vectorize the loop.
9283 LoopVectorizationRequirements Requirements;
9284 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9285 &Requirements, &Hints, DB, AC,
9286 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
9288 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9289 Hints.emitRemarkWithHints();
9290 return false;
9291 }
9292
9293 if (LVL.hasUncountableEarlyExit()) {
9295 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
9296 "early exit is not enabled",
9297 "UncountableEarlyExitLoopsDisabled", ORE, L);
9298 return false;
9299 }
9300 }
9301
9302 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9303 // here. They may require CFG and instruction level transformations before
9304 // even evaluating whether vectorization is profitable. Since we cannot modify
9305 // the incoming IR, we need to build VPlan upfront in the vectorization
9306 // pipeline.
9307 if (!L->isInnermost())
9308 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9309 ORE, GetBFI, OptForSize, Hints,
9310 Requirements);
9311
9312 assert(L->isInnermost() && "Inner loop expected.");
9313
9314 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9315 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9316
9317 // If an override option has been passed in for interleaved accesses, use it.
9318 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9319 UseInterleaved = EnableInterleavedMemAccesses;
9320
9321 // Analyze interleaved memory accesses.
9322 if (UseInterleaved)
9324
9325 if (LVL.hasUncountableEarlyExit()) {
9326 BasicBlock *LoopLatch = L->getLoopLatch();
9327 if (IAI.requiresScalarEpilogue() ||
9329 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9330 reportVectorizationFailure("Auto-vectorization of early exit loops "
9331 "requiring a scalar epilogue is unsupported",
9332 "UncountableEarlyExitUnsupported", ORE, L);
9333 return false;
9334 }
9335 }
9336
9337 // Check the function attributes and profiles to find out if this function
9338 // should be optimized for size.
9340 getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, &IAI);
9341
9342 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9343 // count by optimizing for size, to minimize overheads.
9344 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9345 if (ExpectedTC && ExpectedTC->isFixed() &&
9346 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9347 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9348 << "This loop is worth vectorizing only if no scalar "
9349 << "iteration overheads are incurred.");
9351 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9352 else {
9353 LLVM_DEBUG(dbgs() << "\n");
9354 // Predicate tail-folded loops are efficient even when the loop
9355 // iteration count is low. However, setting the epilogue policy to
9356 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9357 // with runtime checks. It's more effective to let
9358 // `isOutsideLoopWorkProfitable` determine if vectorization is
9359 // beneficial for the loop.
9362 }
9363 }
9364
9365 // Check the function attributes to see if implicit floats or vectors are
9366 // allowed.
9367 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9369 "Can't vectorize when the NoImplicitFloat attribute is used",
9370 "loop not vectorized due to NoImplicitFloat attribute",
9371 "NoImplicitFloat", ORE, L);
9372 Hints.emitRemarkWithHints();
9373 return false;
9374 }
9375
9376 // Check if the target supports potentially unsafe FP vectorization.
9377 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9378 // for the target we're vectorizing for, to make sure none of the
9379 // additional fp-math flags can help.
9380 if (Hints.isPotentiallyUnsafe() &&
9381 TTI->isFPVectorizationPotentiallyUnsafe()) {
9383 "Potentially unsafe FP op prevents vectorization",
9384 "loop not vectorized due to unsafe FP support.",
9385 "UnsafeFP", ORE, L);
9386 Hints.emitRemarkWithHints();
9387 return false;
9388 }
9389
9390 bool AllowOrderedReductions;
9391 // If the flag is set, use that instead and override the TTI behaviour.
9392 if (ForceOrderedReductions.getNumOccurrences() > 0)
9393 AllowOrderedReductions = ForceOrderedReductions;
9394 else
9395 AllowOrderedReductions = TTI->enableOrderedReductions();
9396 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9397 ORE->emit([&]() {
9398 auto *ExactFPMathInst = Requirements.getExactFPInst();
9399 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9400 ExactFPMathInst->getDebugLoc(),
9401 ExactFPMathInst->getParent())
9402 << "loop not vectorized: cannot prove it is safe to reorder "
9403 "floating-point operations";
9404 });
9405 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9406 "reorder floating-point operations\n");
9407 Hints.emitRemarkWithHints();
9408 return false;
9409 }
9410
9411 // Use the cost model.
9412 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9413 GetBFI, F, &Hints, IAI, OptForSize);
9414 // Use the planner for vectorization.
9415 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9416 ORE);
9417
9418 // Get user vectorization factor and interleave count.
9419 ElementCount UserVF = Hints.getWidth();
9420 unsigned UserIC = Hints.getInterleave();
9421 if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9422 UserIC = 1;
9423
9424 // Plan how to best vectorize.
9425 LVP.plan(UserVF, UserIC);
9427 unsigned IC = 1;
9428
9429 if (ORE->allowExtraAnalysis(LV_NAME))
9431
9432 GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
9433 if (LVP.hasPlanWithVF(VF.Width)) {
9434 // Select the interleave count.
9435 IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
9436
9437 unsigned SelectedIC = std::max(IC, UserIC);
9438 // Optimistically generate runtime checks if they are needed. Drop them if
9439 // they turn out to not be profitable.
9440 if (VF.Width.isVector() || SelectedIC > 1) {
9441 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
9442 *ORE);
9443
9444 // Bail out early if either the SCEV or memory runtime checks are known to
9445 // fail. In that case, the vector loop would never execute.
9446 using namespace llvm::PatternMatch;
9447 if (Checks.getSCEVChecks().first &&
9448 match(Checks.getSCEVChecks().first, m_One()))
9449 return false;
9450 if (Checks.getMemRuntimeChecks().first &&
9451 match(Checks.getMemRuntimeChecks().first, m_One()))
9452 return false;
9453 }
9454
9455 // Check if it is profitable to vectorize with runtime checks.
9456 bool ForceVectorization =
9458 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
9459 CM.CostKind, CM.PSE, L);
9460 if (!ForceVectorization &&
9461 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9462 LVP.getPlanFor(VF.Width), SEL,
9463 CM.getVScaleForTuning())) {
9464 ORE->emit([&]() {
9466 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9467 L->getHeader())
9468 << "loop not vectorized: cannot prove it is safe to reorder "
9469 "memory operations";
9470 });
9471 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9472 Hints.emitRemarkWithHints();
9473 return false;
9474 }
9475 }
9476
9477 // Identify the diagnostic messages that should be produced.
9478 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9479 bool VectorizeLoop = true, InterleaveLoop = true;
9480 if (VF.Width.isScalar()) {
9481 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9482 VecDiagMsg = {
9483 "VectorizationNotBeneficial",
9484 "the cost-model indicates that vectorization is not beneficial"};
9485 VectorizeLoop = false;
9486 }
9487
9488 if (UserIC == 1 && Hints.getInterleave() > 1) {
9490 "UserIC should only be ignored due to unsafe dependencies");
9491 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
9492 IntDiagMsg = {"InterleavingUnsafe",
9493 "Ignoring user-specified interleave count due to possibly "
9494 "unsafe dependencies in the loop."};
9495 InterleaveLoop = false;
9496 } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
9497 // Tell the user interleaving was avoided up-front, despite being explicitly
9498 // requested.
9499 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9500 "interleaving should be avoided up front\n");
9501 IntDiagMsg = {"InterleavingAvoided",
9502 "Ignoring UserIC, because interleaving was avoided up front"};
9503 InterleaveLoop = false;
9504 } else if (IC == 1 && UserIC <= 1) {
9505 // Tell the user interleaving is not beneficial.
9506 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9507 IntDiagMsg = {
9508 "InterleavingNotBeneficial",
9509 "the cost-model indicates that interleaving is not beneficial"};
9510 InterleaveLoop = false;
9511 if (UserIC == 1) {
9512 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9513 IntDiagMsg.second +=
9514 " and is explicitly disabled or interleave count is set to 1";
9515 }
9516 } else if (IC > 1 && UserIC == 1) {
9517 // Tell the user interleaving is beneficial, but it explicitly disabled.
9518 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
9519 "disabled.\n");
9520 IntDiagMsg = {"InterleavingBeneficialButDisabled",
9521 "the cost-model indicates that interleaving is beneficial "
9522 "but is explicitly disabled or interleave count is set to 1"};
9523 InterleaveLoop = false;
9524 }
9525
9526 // If there is a histogram in the loop, do not just interleave without
9527 // vectorizing. The order of operations will be incorrect without the
9528 // histogram intrinsics, which are only used for recipes with VF > 1.
9529 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
9530 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
9531 << "to histogram operations.\n");
9532 IntDiagMsg = {
9533 "HistogramPreventsScalarInterleaving",
9534 "Unable to interleave without vectorization due to constraints on "
9535 "the order of histogram operations"};
9536 InterleaveLoop = false;
9537 }
9538
9539 // Override IC if user provided an interleave count.
9540 IC = UserIC > 0 ? UserIC : IC;
9541
9542 // Emit diagnostic messages, if any.
9543 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9544 if (!VectorizeLoop && !InterleaveLoop) {
9545 // Do not vectorize or interleaving the loop.
9546 ORE->emit([&]() {
9547 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9548 L->getStartLoc(), L->getHeader())
9549 << VecDiagMsg.second;
9550 });
9551 ORE->emit([&]() {
9552 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9553 L->getStartLoc(), L->getHeader())
9554 << IntDiagMsg.second;
9555 });
9556 return false;
9557 }
9558
9559 if (!VectorizeLoop && InterleaveLoop) {
9560 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9561 ORE->emit([&]() {
9562 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9563 L->getStartLoc(), L->getHeader())
9564 << VecDiagMsg.second;
9565 });
9566 } else if (VectorizeLoop && !InterleaveLoop) {
9567 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9568 << ") in " << L->getLocStr() << '\n');
9569 ORE->emit([&]() {
9570 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9571 L->getStartLoc(), L->getHeader())
9572 << IntDiagMsg.second;
9573 });
9574 } else if (VectorizeLoop && InterleaveLoop) {
9575 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9576 << ") in " << L->getLocStr() << '\n');
9577 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9578 }
9579
9580 // Report the vectorization decision.
9581 if (VF.Width.isScalar()) {
9582 using namespace ore;
9583 assert(IC > 1);
9584 ORE->emit([&]() {
9585 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9586 L->getHeader())
9587 << "interleaved loop (interleaved count: "
9588 << NV("InterleaveCount", IC) << ")";
9589 });
9590 } else {
9591 // Report the vectorization decision.
9592 reportVectorization(ORE, L, VF, IC);
9593 }
9594 if (ORE->allowExtraAnalysis(LV_NAME))
9596
9597 // If we decided that it is *legal* to interleave or vectorize the loop, then
9598 // do it.
9599
9600 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9601 // Consider vectorizing the epilogue too if it's profitable.
9602 VectorizationFactor EpilogueVF =
9604 bool HasBranchWeights =
9605 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9606 if (EpilogueVF.Width.isVector()) {
9607 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
9608
9609 // The first pass vectorizes the main loop and creates a scalar epilogue
9610 // to be vectorized by executing the plan (potentially with a different
9611 // factor) again shortly afterwards.
9612 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
9613 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
9614 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
9615 SmallVector<VPInstruction *> ResumeValues =
9616 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
9617 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
9618 BestEpiPlan);
9619
9620 // Add minimum iteration check for the epilogue plan, followed by runtime
9621 // checks for the main plan.
9622 LVP.addMinimumIterationCheck(*BestMainPlan, EPI.EpilogueVF, EPI.EpilogueUF,
9624 LVP.attachRuntimeChecks(*BestMainPlan, Checks, HasBranchWeights);
9626 *BestMainPlan, EPI.MainLoopVF, EPI.MainLoopUF,
9628 HasBranchWeights ? MinItersBypassWeights : nullptr,
9629 L->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
9630
9631 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9632 Checks, *BestMainPlan);
9633 auto ExpandedSCEVs = LVP.executePlan(
9634 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT,
9636 ++LoopsVectorized;
9637
9638 // Derive EPI fields from VPlan-generated IR.
9639 BasicBlock *EntryBB =
9640 cast<VPIRBasicBlock>(BestMainPlan->getEntry())->getIRBasicBlock();
9641 EntryBB->setName("iter.check");
9642 EPI.EpilogueIterationCountCheck = EntryBB;
9643 // The check chain is: Entry -> [SCEV] -> [Mem] -> MainCheck -> VecPH.
9644 // MainCheck is the non-bypass successor of the last runtime check block
9645 // (or Entry if there are no runtime checks).
9646 BasicBlock *LastCheck = EntryBB;
9647 if (BasicBlock *MemBB = Checks.getMemRuntimeChecks().second)
9648 LastCheck = MemBB;
9649 else if (BasicBlock *SCEVBB = Checks.getSCEVChecks().second)
9650 LastCheck = SCEVBB;
9651 BasicBlock *ScalarPH = L->getLoopPreheader();
9652 auto *BI = cast<CondBrInst>(LastCheck->getTerminator());
9654 BI->getSuccessor(BI->getSuccessor(0) == ScalarPH);
9655
9656 // Second pass vectorizes the epilogue and adjusts the control flow
9657 // edges from the first pass.
9658 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9659 Checks, BestEpiPlan);
9661 BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE());
9662 LVP.attachRuntimeChecks(BestEpiPlan, Checks, HasBranchWeights);
9663 LVP.executePlan(
9664 EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
9666 connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, Checks, InstsToMove,
9667 ResumeValues);
9668 ++LoopsEpilogueVectorized;
9669 } else {
9670 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
9671 BestPlan);
9672 // TODO: Move to general VPlan pipeline once epilogue loops are also
9673 // supported.
9675 BestPlan, VF.Width, IC, PSE);
9676 LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
9678 LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights);
9679
9680 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
9681 ++LoopsVectorized;
9682 }
9683
9684 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
9685 "DT not preserved correctly");
9686 assert(!verifyFunction(*F, &dbgs()));
9687
9688 return true;
9689}
9690
9692
9693 // Don't attempt if
9694 // 1. the target claims to have no vector registers, and
9695 // 2. interleaving won't help ILP.
9696 //
9697 // The second condition is necessary because, even if the target has no
9698 // vector registers, loop vectorization may still enable scalar
9699 // interleaving.
9700 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9701 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
9702 return LoopVectorizeResult(false, false);
9703
9704 bool Changed = false, CFGChanged = false;
9705
9706 // The vectorizer requires loops to be in simplified form.
9707 // Since simplification may add new inner loops, it has to run before the
9708 // legality and profitability checks. This means running the loop vectorizer
9709 // will simplify all loops, regardless of whether anything end up being
9710 // vectorized.
9711 for (const auto &L : *LI)
9712 Changed |= CFGChanged |=
9713 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9714
9715 // Build up a worklist of inner-loops to vectorize. This is necessary as
9716 // the act of vectorizing or partially unrolling a loop creates new loops
9717 // and can invalidate iterators across the loops.
9718 SmallVector<Loop *, 8> Worklist;
9719
9720 for (Loop *L : *LI)
9721 collectSupportedLoops(*L, LI, ORE, Worklist);
9722
9723 LoopsAnalyzed += Worklist.size();
9724
9725 // Now walk the identified inner loops.
9726 while (!Worklist.empty()) {
9727 Loop *L = Worklist.pop_back_val();
9728
9729 // For the inner loops we actually process, form LCSSA to simplify the
9730 // transform.
9731 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9732
9733 Changed |= CFGChanged |= processLoop(L);
9734
9735 if (Changed) {
9736 LAIs->clear();
9737
9738#ifndef NDEBUG
9739 if (VerifySCEV)
9740 SE->verify();
9741#endif
9742 }
9743 }
9744
9745 // Process each loop nest in the function.
9746 return LoopVectorizeResult(Changed, CFGChanged);
9747}
9748
9751 LI = &AM.getResult<LoopAnalysis>(F);
9752 // There are no loops in the function. Return before computing other
9753 // expensive analyses.
9754 if (LI->empty())
9755 return PreservedAnalyses::all();
9764 AA = &AM.getResult<AAManager>(F);
9765
9766 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
9767 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
9768 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
9770 };
9771 LoopVectorizeResult Result = runImpl(F);
9772 if (!Result.MadeAnyChange)
9773 return PreservedAnalyses::all();
9775
9776 if (isAssignmentTrackingEnabled(*F.getParent())) {
9777 for (auto &BB : F)
9779 }
9780
9781 PA.preserve<LoopAnalysis>();
9785
9786 if (Result.MadeCFGChange) {
9787 // Making CFG changes likely means a loop got vectorized. Indicate that
9788 // extra simplification passes should be run.
9789 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
9790 // be run if runtime checks have been added.
9793 } else {
9795 }
9796 return PA;
9797}
9798
9800 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
9801 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
9802 OS, MapClassName2PassName);
9803
9804 OS << '<';
9805 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
9806 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
9807 OS << '>';
9808}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Lower Kernel Arguments
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static InstructionCost getCost(Instruction &Inst, TTI::TargetCostKind CostKind, TargetTransformInfo &TTI)
Definition CostModel.cpp:73
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
static bool hasNoUnsignedWrap(BinaryOperator &I)
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
This header provides classes for managing per-loop analyses.
static cl::opt< bool > WidenIV("loop-flatten-widen-iv", cl::Hidden, cl::init(true), cl::desc("Widen the loop induction variables, if possible, so " "overflow checks won't reject flattening"))
static const char * VerboseDebug
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static unsigned getMaxTCFromNonZeroRange(PredicatedScalarEvolution &PSE, Loop *L)
Get the maximum trip count for L from the SCEV unsigned range, excluding zero from the range.
static Type * maybeVectorizeType(Type *Ty, ElementCount VF)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L)
A version of ScalarEvolution::getSmallConstantTripCount that returns an ElementCount to include loops...
static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan)
Returns true if the VPlan contains header phi recipes that are not currently supported for epilogue v...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI, DominatorTree *DT, GeneratedRTChecks &Checks, ArrayRef< Instruction * > InstsToMove, ArrayRef< VPInstruction * > ResumeValues)
Connect the epilogue vector loop generated for EpiPlan to the main vector loop, after both plans have...
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static void legacyCSE(BasicBlock *BB)
FIXME: This legacy common-subexpression-elimination routine is scheduled for removal,...
static VPIRBasicBlock * replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB, VPlan *Plan=nullptr)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, std::function< BlockFrequencyInfo &()> GetBFI, bool OptForSize, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
static unsigned estimateElementCount(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the ElementCount at runtime.
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static SmallVector< VPInstruction * > preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop, ElementCount VF)
Return true if the original loop \ TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static std::optional< ElementCount > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true, bool CanExcludeZeroTrips=false)
Returns "best known" trip count, which is either a valid positive trip count or std::nullopt when an ...
static SmallVector< Instruction * > preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM, ScalarEvolution &SE)
Prepare Plan for vectorizing the epilogue loop.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool hasReplicatorRegion(VPlan &Plan)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static bool hasFindLastReductionPhi(VPlan &Plan)
Returns true if the VPlan contains a VPReductionPHIRecipe with FindLast recurrence kind.
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, VPlan &Plan, ElementCount VF)
For loops with uncountable early exits, find the cost of doing work when exiting the loop early,...
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, VPlan &BestEpiPlan, ArrayRef< VPInstruction * > ResumeValues)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > ForceTargetSupportsMaskedMemoryOps("force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports masked memory operations (used for " "testing)."))
Note: This currently only applies to llvm.masked.load and llvm.masked.store.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, ScalarEpilogueLowering SEL, std::optional< unsigned > VScale)
This function determines whether or not it's still profitable to vectorize the loop given the extra w...
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
This file contains some templates that are useful if you are working with the STL at all.
#define OP(OPC)
Definition Instruction.h:46
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
#define RUN_VPLAN_PASS_NO_VERIFY(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:986
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Conditional Branch instruction.
BasicBlock * getSuccessor(unsigned i) const
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
This class represents a range of values.
LLVM_ABI APInt getUnsignedMax() const
Return the largest unsigned value contained in the ConstantRange.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getTemporary()
Definition DebugLoc.h:160
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
void insert_range(Range &&R)
Inserts range of 'std::pair<KeyT, ValueT>' values into the map.
Definition DenseMap.h:294
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i....
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Check, VPlan &Plan)
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:728
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
A struct for saving information about induction variables.
const SCEV * getStep() const
ArrayRef< Instruction * > getCastInsts() const
Returns an ArrayRef to the type cast instructions in the induction update chain, that are redundant w...
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor)
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
const TargetTransformInfo * TTI
Target Transform Info.
LoopVectorizationCostModel * Cost
The profitability analysis.
friend class LoopVectorizationPlanner
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationCostModel *CM, GeneratedRTChecks &RTChecks, VPlan &Plan)
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
DominatorTree * DT
Dominator Tree.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
virtual BasicBlock * createVectorizedLoopSkeleton()
Creates a basic block for the scalar preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
VPBasicBlock * VectorPHVPBB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
unsigned UF
The vectorization unroll factor to use.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
BasicBlock * createScalarPreheader(StringRef Prefix)
Create and return a new IR basic block for the scalar preheader whose name is prefixed with Prefix.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
LLVM_ABI APInt getMask() const
For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
Definition Type.cpp:378
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
Drive the analysis of interleaved memory accesses in the loop.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool useWideActiveLaneMask() const
Returns true if the use of wide lane masks is requested and the loop is using tail-folding with a lan...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
BlockFrequencyInfo * BFI
The BlockFrequencyInfo returned from GetBFI.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
BlockFrequencyInfo & getBFI()
Returns the BlockFrequencyInfo for the function if cached, otherwise fetches it via GetBFI.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF)
Returns true if an artificially high cost for emulated masked memrefs should be used.
void collectNonVectorizedAndSetWideningDecisions(ElementCount VF)
Collect values that will not be widened, including Uniforms, Scalars, and Instructions to Scalarize f...
bool isMaskRequired(Instruction *I) const
Wrapper function for LoopVectorizationLegality::isMaskRequired, that passes the Instruction I and if ...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationLegality * Legal
Vectorization legality.
uint64_t getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB)
A helper function that returns how much we should divide the cost of a predicated block by.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool OptForSize
Whether this loop should be optimized for size based on function attribute or profile information.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind)
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
bool shouldConsiderRegPressureForVF(ElementCount VF)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
std::optional< unsigned > getVScaleForTuning() const
Return the value of vscale used for tuning the cost model.
void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool preferPredicatedLoop() const
Returns true if tail-folding is preferred over a scalar epilogue.
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF)
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool isScalarWithPredication(Instruction *I, ElementCount VF)
Returns true if I is an instruction which requires predication and for which our chosen predication s...
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
std::function< BlockFrequencyInfo &()> GetBFI
A function to lazily fetch BlockFrequencyInfo.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, std::function< BlockFrequencyInfo &()> GetBFI, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI, bool OptForSize)
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
FixedScalableVFPair MaxPermissibleVFWithoutMaxBW
The highest VF possible for this loop, without using MaxBandwidth.
const SmallPtrSetImpl< PHINode * > & getInLoopReductions() const
Returns the set of in-loop reduction PHIs.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
TailFoldingStyle getTailFoldingStyle() const
Returns the TailFoldingStyle that is best for the current loop.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
MapVector< PHINode *, InductionDescriptor > InductionList
InductionList saves induction variables and maps them to the induction descriptor.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
bool hasUncountableEarlyExit() const
Returns true if the loop has uncountable early exits, i.e.
bool hasHistograms() const
Returns true if the loop contains any known histogram operations.
const LoopAccessInfo * getLAI() const
Planner drives the vectorization process after having passed Legality checks.
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, EpilogueVectorizationKind EpilogueVecKind=EpilogueVectorizationKind::None)
EpilogueVectorizationKind
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
@ MainLoop
Vectorizing the main loop of epilogue vectorization.
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition VPlan.cpp:1632
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan, bool VectorizingEpilogue, MDNode *OrigLoopID, std::optional< unsigned > OrigAverageTripCount, unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll)
Update loop metadata and profile info for both the scalar remainder loop and VectorLoop,...
Definition VPlan.cpp:1683
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition VPlan.cpp:1616
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const
Attach the runtime checks of RTChecks to Plan.
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost)
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1597
VectorizationFactor selectEpilogueVectorizationFactor(ElementCount MainLoopVF, unsigned IC)
void printPlans(raw_ostream &O)
Definition VPlan.cpp:1777
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const
Create a check to Plan to see if the vector loop should be executed based on its trip count.
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition LoopInfo.cpp:73
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:653
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:124
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
Value * getIncomingValueForBlock(const BasicBlock *BB) const
unsigned getNumIncomingValues() const
Return the number of incoming edges.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEVPredicate & getPredicate() const
LLVM_ABI unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
LLVM_ABI const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
bool hasUsesOutsideReductionChain() const
Returns true if the reduction PHI has any uses outside the reduction chain.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
LLVM_ABI Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
void eraseDeadInstructions(Value *Root)
Remove inserted instructions that are dead, e.g.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getURemExpr(SCEVUse LHS, SCEVUse RHS)
Represents an unsigned remainder expression based on unsigned division.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
LLVM_ABI void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI std::optional< unsigned > getVScaleForTuning() const
LLVM_ABI bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
LLVM_ABI bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI InstructionCost getOperandsScalarizationOverhead(ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing operands with the given types.
LLVM_ABI bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI bool isElementTypeLegalForScalableVector(Type *Ty) const
LLVM_ABI ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
LLVM_ABI InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI bool supportsScalableVectors() const
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
LLVM_ABI InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4253
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4280
iterator end()
Definition VPlan.h:4290
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4288
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4341
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override
Return the cost of this VPBasicBlock.
Definition VPlan.cpp:778
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
const VPRecipeBase & front() const
Definition VPlan.h:4300
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
bool empty() const
Definition VPlan.h:4299
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
void setName(const Twine &newName)
Definition VPlan.h:183
VPlan * getPlan()
Definition VPlan.cpp:177
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:273
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition VPlanUtils.h:244
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3831
VPIRValue * getStartValue() const
Returns the start value of the canonical induction.
Definition VPlan.h:3853
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:465
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:438
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2306
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2348
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2337
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2048
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4406
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1456
iterator_range< operand_iterator > operandsWithoutMask()
Returns an iterator range over the operands excluding the mask operand if present.
Definition VPlan.h:1476
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1272
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1330
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1321
unsigned getOpcode() const
Definition VPlan.h:1405
void setName(StringRef NewName)
Set the symbolic name for the VPInstruction.
Definition VPlan.h:1504
VPValue * getMask() const
Returns the mask for the VPInstruction.
Definition VPlan.h:1470
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1446
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2970
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1633
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, VFRange &Range)
Create and return a widened recipe for a non-phi recipe R if one can be created within the given VF R...
VPValue * getVPValueOrAddLiveIn(Value *V)
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
bool isOrdered() const
Returns true, if the phi is part of an ordered reduction.
Definition VPlan.h:2761
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2740
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2764
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2758
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3063
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4441
const VPBlockBase * getEntry() const
Definition VPlan.h:4477
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4539
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3217
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:607
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:675
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:296
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:340
operand_iterator op_begin()
Definition VPlanValue.h:360
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:335
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:70
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1428
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1434
user_range users()
Definition VPlanValue.h:149
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2154
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1840
A recipe for handling GEP instructions.
Definition VPlan.h:2090
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2454
A recipe for widened phis.
Definition VPlan.h:2590
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1784
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4571
bool hasVF(ElementCount VF) const
Definition VPlan.h:4784
VPBasicBlock * getEntry()
Definition VPlan.h:4663
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4721
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4791
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4760
bool hasUF(unsigned UF) const
Definition VPlan.h:4802
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4711
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4827
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4853
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1058
bool hasEarlyExit() const
Returns true if the VPlan is based on a loop with an early exit.
Definition VPlan.h:4948
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition VPlan.cpp:1040
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4735
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4688
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4757
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4702
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition VPlan.cpp:922
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4707
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4668
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4753
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1206
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:709
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
constexpr bool isNonZero() const
Definition TypeSize.h:155
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr bool isZero() const
Definition TypeSize.h:153
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:237
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
cst_pred_ty< is_specific_signed_cst > m_scev_SpecificSInt(int64_t V)
Match an SCEV constant with a plain signed integer (sign-extended value will be matched)
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t, SCEV::FlagAnyWrap, true > m_scev_c_Mul(const Op0_t &Op0, const Op1_t &Op1)
class_match< const SCEV > m_SCEV()
int_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
bool matchFindIVResult(VPInstruction *VPI, Op0_t ReducedIV, Op1_t Start)
Match FindIV result pattern: select(icmp ne ComputeReductionResult(ReducedIV), Sentinel),...
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
bool match(Val *V, const Pattern &P)
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPBasicBlock * getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT)
Returns the header block of the first, top-level loop, or null if none exist.
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:683
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
LLVM_ABI_FOR_TEST cl::opt< bool > VerifyEachVPlan
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
LLVM_ABI bool VerifySCEV
LLVM_ABI_FOR_TEST cl::opt< bool > VPlanPrintAfterAll
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:280
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:366
constexpr auto bind_front(FnT &&Fn, BindArgsT &&...BindArgs)
C++20 bind_front.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI)
Return true if the control flow in RPOTraversal is irreducible.
Definition CFG.h:154
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:83
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:88
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:93
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI cl::opt< bool > EnableLoopVectorization
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI_FOR_TEST cl::list< std::string > VPlanPrintAfterPasses
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:422
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1837
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
cl::opt< bool > EnableVPlanNativePath
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool pred_empty(const BasicBlock *BB)
Definition CFG.h:119
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataAndControlFlow
Use predicate to control both data and control flow.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:345
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:78
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI_FOR_TEST bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
LLVM_ABI_FOR_TEST cl::opt< bool > VPlanPrintVectorRegionScope
LLVM_ABI cl::opt< bool > EnableLoopInterleaving
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition Analysis.h:29
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
TargetLibraryInfo * TLI
LLVM_ABI LoopVectorizeResult runImpl(Function &F)
LLVM_ABI bool processLoop(Loop *L)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LLVM_ABI LoopVectorizePass(LoopVectorizeOptions Opts={})
ScalarEvolution * SE
AssumptionCache * AC
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
std::function< BlockFrequencyInfo &()> GetBFI
TargetTransformInfo * TTI
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:70
A marker analysis to determine if extra passes should be run after loop vectorization.
static LLVM_ABI AnalysisKey Key
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
ElementCount End
Struct to hold various analysis needed for cost computations.
LoopVectorizationCostModel & CM
bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const
Return true if I is considered uniform-after-vectorization in the legacy cost model for VF.
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
uint64_t getPredBlockCostDivisor(BasicBlock *BB) const
TargetTransformInfo::TargetCostKind CostKind
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A struct that represents some properties of the register usage of a loop.
InstructionCost spillCost(VPCostContext &Ctx, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3619
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3702
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, PredicatedScalarEvolution &PSE, VPBasicBlock *CheckBlock=nullptr)
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static LLVM_ABI_FOR_TEST std::unique_ptr< VPlan > buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE, LoopVersioning *LVer=nullptr)
Create a base VPlan0, serving as the common starting point for all later candidates.
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void introduceMasksAndLinearize(VPlan &Plan)
Predicate and linearize the control-flow in the only loop region of Plan.
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void createInLoopReductionRecipes(VPlan &Plan, const DenseSet< BasicBlock * > &BlocksNeedingPredication, ElementCount MinVF)
Create VPReductionRecipes for in-loop reductions.
static void foldTailByMasking(VPlan &Plan)
Adapts the vector loop region for tail folding by introducing a header mask and conditionally executi...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addMinimumVectorEpilogueIterationCheck(VPlan &Plan, Value *VectorTripCount, bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE)
Add a check to Plan to see if the epilogue vector loop should be executed.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool handleMultiUseReductions(VPlan &Plan, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
Try to legalize reductions with multiple in-loop uses.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static bool handleFindLastReductions(VPlan &Plan)
Check if Plan contains any FindLast reductions.
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void createHeaderPhiRecipes(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop, const MapVector< PHINode *, InductionDescriptor > &Inductions, const MapVector< PHINode *, RecurrenceDescriptor > &Reductions, const SmallPtrSetImpl< const PHINode * > &FixedOrderRecurrences, const SmallPtrSetImpl< PHINode * > &InLoopReductions, bool AllowReordering)
Replace VPPhi recipes in Plan's header with corresponding VPHeaderPHIRecipe subclasses for inductions...
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static LLVM_ABI_FOR_TEST bool handleEarlyExits(VPlan &Plan, UncountableExitStyle Style, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC)
Update Plan to account for all early exits.
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool handleMaxMinNumReductions(VPlan &Plan)
Check if Plan contains any FMaxNum or FMinNum reductions.
static LLVM_ABI_FOR_TEST void createLoopRegions(VPlan &Plan)
Replace loops in Plan's flat CFG with VPRegionBlocks, turning Plan's flat CFG into a hierarchical CFG...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace replicating VPReplicateRecipe, VPScalarIVStepsRecipe and VPInstruction in Plan with VF single...
static void addIterationCountCheckBlock(VPlan &Plan, ElementCount VF, unsigned UF, bool RequiresScalarEpilogue, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, PredicatedScalarEvolution &PSE)
Add a new check block before the vector preheader to Plan to check if the main vector loop should be ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step)
Materialize vector trip count computations to a set of VPInstructions.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...
static LLVM_ABI_FOR_TEST void addMiddleCheck(VPlan &Plan, bool TailFolded)
If a check is needed to guard executing the scalar epilogue loop, it will be added to the middle bloc...
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static LLVM_ABI bool HoistRuntimeChecks