LLVM 23.0.0git
LoopVectorize.cpp
Go to the documentation of this file.
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160using namespace SCEVPatternMatch;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169STATISTIC(LoopsVectorized, "Number of loops vectorized");
170STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
172STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
173
175 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
176 cl::desc("Enable vectorization of epilogue loops."));
177
179 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
180 cl::desc("When epilogue vectorization is enabled, and a value greater than "
181 "1 is specified, forces the given VF for all applicable epilogue "
182 "loops."));
183
185 "epilogue-vectorization-minimum-VF", cl::Hidden,
186 cl::desc("Only loops with vectorization factor equal to or larger than "
187 "the specified value are considered for epilogue vectorization."));
188
189/// Loops with a known constant trip count below this number are vectorized only
190/// if no scalar iteration overheads are incurred.
192 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
193 cl::desc("Loops with a constant trip count that is smaller than this "
194 "value are vectorized only if no scalar iteration overheads "
195 "are incurred."));
196
198 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
199 cl::desc("The maximum allowed number of runtime memory checks"));
200
201/// Option tail-folding-policy controls the tail-folding strategy and lists all
202/// available options. The vectorizer will attempt to fold the tail-loop into
203/// the vector loop (main/epilogue loops) and predicate the instructions
204/// accordingly. If tail-folding fails, there are different fallback strategies
205/// depending on these values:
207
209 "tail-folding-policy", cl::init(TailFoldingPolicyTy::None), cl::Hidden,
210 cl::desc("Tail-folding preferences over creating an epilogue loop."),
212 clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail",
213 "Don't tail-fold loops."),
215 "prefer tail-folding, otherwise create an epilogue when "
216 "appropriate."),
218 "always tail-fold, don't attempt vectorization if "
219 "tail-folding fails.")));
220
222 "epilogue-tail-folding-policy", cl::Hidden,
223 cl::desc(
224 "Epilogue-tail-folding preferences over creating an epilogue loop."),
226 clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail",
227 "Don't tail-fold loops."),
229 "prefer tail-folding, otherwise create an epilogue when "
230 "appropriate.")));
231
233 "force-tail-folding-style", cl::desc("Force the tail folding style"),
236 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
239 "Create lane mask for data only, using active.lane.mask intrinsic"),
241 "data-without-lane-mask",
242 "Create lane mask with compare/stepvector"),
244 "Create lane mask using active.lane.mask intrinsic, and use "
245 "it for both data and control flow"),
247 "Use predicated EVL instructions for tail folding. If EVL "
248 "is unsupported, fallback to data-without-lane-mask.")));
249
251 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
252 cl::desc("Enable use of wide lane masks when used for control flow in "
253 "tail-folded loops"));
254
256 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
257 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
258
259/// An interleave-group may need masking if it resides in a block that needs
260/// predication, or in order to mask away gaps.
262 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
263 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
264
266 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
267 cl::desc("A flag that overrides the target's number of scalar registers."));
268
270 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
271 cl::desc("A flag that overrides the target's number of vector registers."));
272
274 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
275 cl::desc("A flag that overrides the target's max interleave factor for "
276 "scalar loops."));
277
279 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
280 cl::desc("A flag that overrides the target's max interleave factor for "
281 "vectorized loops."));
282
284 "force-target-instruction-cost", cl::init(0), cl::Hidden,
285 cl::desc("A flag that overrides the target's expected cost for "
286 "an instruction to a single constant value. Mostly "
287 "useful for getting consistent testing."));
288
290 "small-loop-cost", cl::init(20), cl::Hidden,
291 cl::desc(
292 "The cost of a loop that is considered 'small' by the interleaver."));
293
295 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
296 cl::desc("Enable the use of the block frequency analysis to access PGO "
297 "heuristics minimizing code growth in cold regions and being more "
298 "aggressive in hot regions."));
299
300// Runtime interleave loops for load/store throughput.
302 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
303 cl::desc(
304 "Enable runtime interleaving until load/store ports are saturated"));
305
306/// The number of stores in a loop that are allowed to need predication.
308 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
309 cl::desc("Max number of stores to be predicated behind an if."));
310
312 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
313 cl::desc("Count the induction variable only once when interleaving"));
314
316 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
317 cl::desc("The maximum interleave count to use when interleaving a scalar "
318 "reduction in a nested loop."));
319
321 "force-ordered-reductions", cl::init(false), cl::Hidden,
322 cl::desc("Enable the vectorisation of loops with in-order (strict) "
323 "FP reductions"));
324
326 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
327 cl::desc(
328 "Prefer predicating a reduction operation over an after loop select."));
329
331 "enable-vplan-native-path", cl::Hidden,
332 cl::desc("Enable VPlan-native vectorization path with "
333 "support for outer loop vectorization."));
334
336 llvm::VerifyEachVPlan("vplan-verify-each",
337#ifdef EXPENSIVE_CHECKS
338 cl::init(true),
339#else
340 cl::init(false),
341#endif
343 cl::desc("Verify VPlans after VPlan transforms."));
344
345#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
347 "vplan-print-after-all", cl::init(false), cl::Hidden,
348 cl::desc("Print VPlans after all VPlan transformations."));
349
351 "vplan-print-after", cl::Hidden,
352 cl::desc("Print VPlans after specified VPlan transformations (regexp)."));
353
355 "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
356 cl::desc("Limit VPlan printing to vector loop region in "
357 "`-vplan-print-after*` if the plan has one."));
358#endif
359
360// This flag enables the stress testing of the VPlan H-CFG construction in the
361// VPlan-native vectorization path. It must be used in conjunction with
362// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363// verification of the H-CFGs built.
365 "vplan-build-outerloop-stress-test", cl::init(false), cl::Hidden,
366 cl::desc(
367 "Build VPlan for every supported loop nest in the function and bail "
368 "out right after the build (stress test the VPlan H-CFG construction "
369 "in the VPlan-native vectorization path)."));
370
372 "interleave-loops", cl::init(true), cl::Hidden,
373 cl::desc("Enable loop interleaving in Loop vectorization passes"));
375 "vectorize-loops", cl::init(true), cl::Hidden,
376 cl::desc("Run the Loop vectorization passes"));
377
379 ForceMaskedDivRem("force-widen-divrem-via-masked-intrinsic", cl::Hidden,
380 cl::desc("Override cost based masked intrinsic widening "
381 "for div/rem instructions"));
382
384 "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
385 cl::desc(
386 "Enable vectorization of early exit loops with uncountable exits."));
387
388// Likelihood of bypassing the vectorized loop because there are zero trips left
389// after prolog. See `emitIterationCountCheck`.
390static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
391
392/// A helper function that returns true if the given type is irregular. The
393/// type is irregular if its allocated size doesn't equal the store size of an
394/// element of the corresponding vector type.
395static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
396 // Determine if an array of N elements of type Ty is "bitcast compatible"
397 // with a <N x Ty> vector.
398 // This is only true if there is no padding between the array elements.
399 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
400}
401
402/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
403/// ElementCount to include loops whose trip count is a function of vscale.
405 const Loop *L) {
406 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
407 return ElementCount::getFixed(ExpectedTC);
408
409 const SCEV *BTC = SE->getBackedgeTakenCount(L);
411 return ElementCount::getFixed(0);
412
413 const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
414 if (isa<SCEVVScale>(ExitCount))
416
417 const APInt *Scale;
418 if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
419 if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
420 if (Scale->getActiveBits() <= 32)
422
423 return ElementCount::getFixed(0);
424}
425
426/// Get the maximum trip count for \p L from the SCEV unsigned range, excluding
427/// zero from the range. Only valid when not folding the tail, as the minimum
428/// iteration count check guards against a zero trip count. Returns 0 if
429/// unknown.
431 Loop *L) {
432 const SCEV *BTC = PSE.getBackedgeTakenCount();
434 return 0;
435 ScalarEvolution *SE = PSE.getSE();
436 const SCEV *TripCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
437 ConstantRange TCRange = SE->getUnsignedRange(TripCount);
438 APInt MaxTCFromRange = TCRange.getUnsignedMax();
439 if (!MaxTCFromRange.isZero() && MaxTCFromRange.getActiveBits() <= 32)
440 return MaxTCFromRange.getZExtValue();
441 return 0;
442}
443
444/// Returns "best known" trip count, which is either a valid positive trip count
445/// or std::nullopt when an estimate cannot be made (including when the trip
446/// count would overflow), for the specified loop \p L as defined by the
447/// following procedure:
448/// 1) Returns exact trip count if it is known.
449/// 2) Returns expected trip count according to profile data if any.
450/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
451/// 4) Returns the maximum trip count from the SCEV range excluding zero,
452/// if \p CanUseConstantMax and \p CanExcludeZeroTrips.
453/// 5) Returns std::nullopt if all of the above failed.
454static std::optional<ElementCount>
456 bool CanUseConstantMax = true,
457 bool CanExcludeZeroTrips = false) {
458 // Check if exact trip count is known.
459 if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
460 return ExpectedTC;
461
462 // Check if there is an expected trip count available from profile data.
464 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
465 return ElementCount::getFixed(*EstimatedTC);
466
467 if (!CanUseConstantMax)
468 return std::nullopt;
469
470 // Check if upper bound estimate is known.
471 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
472 return ElementCount::getFixed(ExpectedTC);
473
474 // Get the maximum trip count from the SCEV range excluding zero. This is
475 // only safe when not folding the tail, as the minimum iteration count check
476 // prevents entering the vector loop with a zero trip count.
477 if (CanUseConstantMax && CanExcludeZeroTrips)
478 if (unsigned RefinedTC = getMaxTCFromNonZeroRange(PSE, L))
479 return ElementCount::getFixed(RefinedTC);
480
481 return std::nullopt;
482}
483
484namespace {
485// Forward declare GeneratedRTChecks.
486class GeneratedRTChecks;
487
488using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489} // namespace
490
491namespace llvm {
492
494
495/// InnerLoopVectorizer vectorizes loops which contain only one basic
496/// block to a specified vectorization factor (VF).
497/// This class performs the widening of scalars into vectors, or multiple
498/// scalars. This class also implements the following features:
499/// * It inserts an epilogue loop for handling loops that don't have iteration
500/// counts that are known to be a multiple of the vectorization factor.
501/// * It handles the code generation for reduction variables.
502/// * Scalarization (implementation using scalars) of un-vectorizable
503/// instructions.
504/// InnerLoopVectorizer does not perform any vectorization-legality
505/// checks, and relies on the caller to check for the different legality
506/// aspects. The InnerLoopVectorizer relies on the
507/// LoopVectorizationLegality class to provide information about the induction
508/// and reduction variables that were found for a given vectorization factor.
510public:
514 ElementCount VecWidth, unsigned UnrollFactor,
516 GeneratedRTChecks &RTChecks, VPlan &Plan)
517 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
518 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
521 Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
522
523 virtual ~InnerLoopVectorizer() = default;
524
525 /// Creates a basic block for the scalar preheader. Both
526 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
527 /// the method to create additional blocks and checks needed for epilogue
528 /// vectorization.
530
531 /// Fix the vectorized code, taking care of header phi's, and more.
533
534 /// Fix the non-induction PHIs in \p Plan.
536
537protected:
539
540 /// Create and return a new IR basic block for the scalar preheader whose name
541 /// is prefixed with \p Prefix.
543
544 /// Allow subclasses to override and print debug traces before/after vplan
545 /// execution, when trace information is requested.
546 virtual void printDebugTracesAtStart() {}
547 virtual void printDebugTracesAtEnd() {}
548
549 /// The original loop.
551
552 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
553 /// dynamic knowledge to simplify SCEV expressions and converts them to a
554 /// more usable form.
556
557 /// Loop Info.
559
560 /// Dominator Tree.
562
563 /// Target Transform Info.
565
566 /// Assumption Cache.
568
569 /// The vectorization SIMD factor to use. Each vector will have this many
570 /// vector elements.
572
573 /// The vectorization unroll factor to use. Each scalar is vectorized to this
574 /// many different vector instructions.
575 unsigned UF;
576
577 /// The builder that we use
579
580 // --- Vectorization state ---
581
582 /// The profitability analysis.
584
585 /// Structure to hold information about generated runtime checks, responsible
586 /// for cleaning the checks, if vectorization turns out unprofitable.
587 GeneratedRTChecks &RTChecks;
588
590
591 /// The vector preheader block of \p Plan, used as target for check blocks
592 /// introduced during skeleton creation.
594};
595
596/// Encapsulate information regarding vectorization of a loop and its epilogue.
597/// This information is meant to be updated and used across two stages of
598/// epilogue vectorization.
601 unsigned MainLoopUF = 0;
603 unsigned EpilogueUF = 0;
608
610 ElementCount EVF, unsigned EUF,
612 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
614 assert(EUF == 1 &&
615 "A high UF for the epilogue loop is likely not beneficial.");
616 }
617};
618
619/// An extension of the inner loop vectorizer that creates a skeleton for a
620/// vectorized loop that has its epilogue (residual) also vectorized.
621/// The idea is to run the vplan on a given loop twice, firstly to setup the
622/// skeleton and vectorize the main loop, and secondly to complete the skeleton
623/// from the first step and vectorize the epilogue. This is achieved by
624/// deriving two concrete strategy classes from this base class and invoking
625/// them in succession from the loop vectorizer planner.
627public:
637
638 /// Holds and updates state information required to vectorize the main loop
639 /// and its epilogue in two separate passes. This setup helps us avoid
640 /// regenerating and recomputing runtime safety checks. It also helps us to
641 /// shorten the iteration-count-check path length for the cases where the
642 /// iteration count of the loop is so small that the main vector loop is
643 /// completely skipped.
645
646protected:
648};
649
650/// A specialized derived class of inner loop vectorizer that performs
651/// vectorization of *main* loops in the process of vectorizing loops and their
652/// epilogues.
654public:
665
666protected:
667 void printDebugTracesAtStart() override;
668 void printDebugTracesAtEnd() override;
669};
670
671// A specialized derived class of inner loop vectorizer that performs
672// vectorization of *epilogue* loops in the process of vectorizing loops and
673// their epilogues.
675public:
682 GeneratedRTChecks &Checks, VPlan &Plan)
684 Checks, Plan, EPI.EpilogueVF,
685 EPI.EpilogueVF, EPI.EpilogueUF) {}
686 /// Implements the interface for creating a vectorized skeleton using the
687 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
689
690protected:
691 void printDebugTracesAtStart() override;
692 void printDebugTracesAtEnd() override;
693};
694} // end namespace llvm
695
696/// Look for a meaningful debug location on the instruction or its operands.
698 if (!I)
699 return DebugLoc::getUnknown();
700
702 if (I->getDebugLoc() != Empty)
703 return I->getDebugLoc();
704
705 for (Use &Op : I->operands()) {
706 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
707 if (OpInst->getDebugLoc() != Empty)
708 return OpInst->getDebugLoc();
709 }
710
711 return I->getDebugLoc();
712}
713
714/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
715/// is passed, the message relates to that particular instruction.
716#ifndef NDEBUG
717static void debugVectorizationMessage(const StringRef Prefix,
718 const StringRef DebugMsg,
719 Instruction *I) {
720 dbgs() << "LV: " << Prefix << DebugMsg;
721 if (I != nullptr)
722 dbgs() << " " << *I;
723 else
724 dbgs() << '.';
725 dbgs() << '\n';
726}
727#endif
728
729/// Create an analysis remark that explains why vectorization failed
730///
731/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
732/// RemarkName is the identifier for the remark. If \p I is passed it is an
733/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
734/// the location of the remark. If \p DL is passed, use it as debug location for
735/// the remark. \return the remark object that can be streamed to.
736static OptimizationRemarkAnalysis
737createLVAnalysis(const char *PassName, StringRef RemarkName,
738 const Loop *TheLoop, Instruction *I, DebugLoc DL = {}) {
739 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
740 // If debug location is attached to the instruction, use it. Otherwise if DL
741 // was not provided, use the loop's.
742 if (I && I->getDebugLoc())
743 DL = I->getDebugLoc();
744 else if (!DL)
745 DL = TheLoop->getStartLoc();
746
747 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
748}
749
750namespace llvm {
751
752/// Return the runtime value for VF.
754 return B.CreateElementCount(Ty, VF);
755}
756
758 const StringRef OREMsg, const StringRef ORETag,
760 const Loop *TheLoop, Instruction *I) {
761 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
762 LoopVectorizeHints Hints(TheLoop, false /* doesn't matter */, *ORE);
763 ORE->emit(createLVAnalysis(LV_NAME, ORETag, TheLoop, I)
764 << "loop not vectorized: " << OREMsg);
765}
766
767void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
769 const Loop *TheLoop, Instruction *I, DebugLoc DL) {
771 LoopVectorizeHints Hints(TheLoop, false /* doesn't matter */, *ORE);
772 ORE->emit(createLVAnalysis(LV_NAME, ORETag, TheLoop, I, DL) << Msg);
773}
774
775/// Report successful vectorization of the loop. In case an outer loop is
776/// vectorized, prepend "outer" to the vectorization remark.
778 VectorizationFactor VF, unsigned IC) {
780 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
781 nullptr));
782 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
783 ORE->emit([&]() {
784 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
785 TheLoop->getHeader())
786 << "vectorized " << LoopType << "loop (vectorization width: "
787 << ore::NV("VectorizationFactor", VF.Width)
788 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
789 });
790}
791
792} // end namespace llvm
793
794namespace llvm {
795
796// Loop vectorization cost-model hints how the epilogue/tail loop should be
797// lowered.
799
800 // The default: allowing epilogues.
802
803 // Vectorization with OptForSize: don't allow epilogues.
805
806 // A special case of vectorisation with OptForSize: loops with a very small
807 // trip count are considered for vectorization under OptForSize, thereby
808 // making sure the cost of their loop body is dominant, free of runtime
809 // guards and scalar iteration overheads.
811
812 // Loop hint indicating an epilogue is undesired, apply tail folding.
814
815 // Directive indicating we must either fold the epilogue/tail or not vectorize
817};
818
819/// LoopVectorizationCostModel - estimates the expected speedups due to
820/// vectorization.
821/// In many cases vectorization is not profitable. This can happen because of
822/// a number of reasons. In this class we mainly attempt to predict the
823/// expected speedup/slowdowns due to the supported instruction set. We use the
824/// TargetTransformInfo to query the different backends for the cost of
825/// different operations.
828
829public:
843
844 /// \return An upper bound for the vectorization factors (both fixed and
845 /// scalable). If the factors are 0, vectorization and interleaving should be
846 /// avoided up front.
847 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
848
849 /// Memory access instruction may be vectorized in more than one way.
850 /// Form of instruction after vectorization depends on cost.
851 /// This function takes cost-based decisions for Load/Store instructions
852 /// and collects them in a map. This decisions map is used for building
853 /// the lists of loop-uniform and loop-scalar instructions.
854 /// The calculated cost is saved with widening decision in order to
855 /// avoid redundant calculations.
856 void setCostBasedWideningDecision(ElementCount VF);
857
858 /// A call may be vectorized in different ways depending on whether we have
859 /// vectorized variants available and whether the target supports masking.
860 /// This function analyzes all calls in the function at the supplied VF,
861 /// makes a decision based on the costs of available options, and stores that
862 /// decision in a map for use in planning and plan execution.
863 void setVectorizedCallDecision(ElementCount VF);
864
865 /// Collect values we want to ignore in the cost model.
866 void collectValuesToIgnore();
867
868 /// \returns True if it is more profitable to scalarize instruction \p I for
869 /// vectorization factor \p VF.
871 assert(VF.isVector() &&
872 "Profitable to scalarize relevant only for VF > 1.");
873 assert(
874 TheLoop->isInnermost() &&
875 "cost-model should not be used for outer loops (in VPlan-native path)");
876
877 auto Scalars = InstsToScalarize.find(VF);
878 assert(Scalars != InstsToScalarize.end() &&
879 "VF not yet analyzed for scalarization profitability");
880 return Scalars->second.contains(I);
881 }
882
883 /// Returns true if \p I is known to be uniform after vectorization.
885 assert(
886 TheLoop->isInnermost() &&
887 "cost-model should not be used for outer loops (in VPlan-native path)");
888
889 // If VF is scalar, then all instructions are trivially uniform.
890 if (VF.isScalar())
891 return true;
892
893 // Pseudo probes must be duplicated per vector lane so that the
894 // profiled loop trip count is not undercounted.
896 return false;
897
898 auto UniformsPerVF = Uniforms.find(VF);
899 assert(UniformsPerVF != Uniforms.end() &&
900 "VF not yet analyzed for uniformity");
901 return UniformsPerVF->second.count(I);
902 }
903
904 /// Returns true if \p I is known to be scalar after vectorization.
906 assert(
907 TheLoop->isInnermost() &&
908 "cost-model should not be used for outer loops (in VPlan-native path)");
909 if (VF.isScalar())
910 return true;
911
912 auto ScalarsPerVF = Scalars.find(VF);
913 assert(ScalarsPerVF != Scalars.end() &&
914 "Scalar values are not calculated for VF");
915 return ScalarsPerVF->second.count(I);
916 }
917
918 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
919 /// for vectorization factor \p VF.
921 const auto &MinBWs = Config.getMinimalBitwidths();
922 // Truncs must truncate at most to their destination type.
923 if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) &&
924 I->getType()->getScalarSizeInBits() < MinBWs.lookup(I))
925 return false;
926 return VF.isVector() && MinBWs.contains(I) &&
929 }
930
931 /// Decision that was taken during cost calculation for memory instruction.
934 CM_Widen, // For consecutive accesses with stride +1.
935 CM_Widen_Reverse, // For consecutive accesses with stride -1.
941 /// A widening decision that has been invalidated after replacing the
942 /// corresponding recipe during VPlan transforms.
943 /// TODO: Remove once the legacy exit cost computation is retired.
945 };
946
947 /// Save vectorization decision \p W and \p Cost taken by the cost model for
948 /// instruction \p I and vector width \p VF.
951 assert(VF.isVector() && "Expected VF >=2");
952 WideningDecisions[{I, VF}] = {W, Cost};
953 }
954
955 /// Save vectorization decision \p W and \p Cost taken by the cost model for
956 /// interleaving group \p Grp and vector width \p VF.
960 assert(VF.isVector() && "Expected VF >=2");
961 /// Broadcast this decision to all instructions inside the group.
962 /// When interleaving, the cost will only be assigned one instruction, the
963 /// insert position. For other cases, add the appropriate fraction of the
964 /// total cost to each instruction. This ensures accurate costs are used,
965 /// even if the insert position instruction is not used.
966 InstructionCost InsertPosCost = Cost;
967 InstructionCost OtherMemberCost = 0;
968 if (W != CM_Interleave)
969 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
970 ;
971 for (auto *I : Grp->members()) {
972 if (Grp->getInsertPos() == I)
973 WideningDecisions[{I, VF}] = {W, InsertPosCost};
974 else
975 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
976 }
977 }
978
979 /// Return the cost model decision for the given instruction \p I and vector
980 /// width \p VF. Return CM_Unknown if this instruction did not pass
981 /// through the cost modeling.
983 assert(VF.isVector() && "Expected VF to be a vector VF");
984 assert(
985 TheLoop->isInnermost() &&
986 "cost-model should not be used for outer loops (in VPlan-native path)");
987
988 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
989 auto Itr = WideningDecisions.find(InstOnVF);
990 if (Itr == WideningDecisions.end())
991 return CM_Unknown;
992 return Itr->second.first;
993 }
994
995 /// Return the vectorization cost for the given instruction \p I and vector
996 /// width \p VF.
998 assert(VF.isVector() && "Expected VF >=2");
999 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1000 assert(WideningDecisions.contains(InstOnVF) &&
1001 "The cost is not calculated");
1002 return WideningDecisions[InstOnVF].second;
1003 }
1004
1011
1013 Function *Variant, Intrinsic::ID IID,
1015 assert(!VF.isScalar() && "Expected vector VF");
1016 CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, Cost};
1017 }
1018
1020 ElementCount VF) const {
1021 assert(!VF.isScalar() && "Expected vector VF");
1022 auto I = CallWideningDecisions.find({CI, VF});
1023 if (I == CallWideningDecisions.end())
1024 return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, 0};
1025 return I->second;
1026 }
1027
1028 /// Return True if instruction \p I is an optimizable truncate whose operand
1029 /// is an induction variable. Such a truncate will be removed by adding a new
1030 /// induction variable with the destination type.
1032 // If the instruction is not a truncate, return false.
1033 auto *Trunc = dyn_cast<TruncInst>(I);
1034 if (!Trunc)
1035 return false;
1036
1037 // Get the source and destination types of the truncate.
1038 Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1039 Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1040
1041 // If the truncate is free for the given types, return false. Replacing a
1042 // free truncate with an induction variable would add an induction variable
1043 // update instruction to each iteration of the loop. We exclude from this
1044 // check the primary induction variable since it will need an update
1045 // instruction regardless.
1046 Value *Op = Trunc->getOperand(0);
1047 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1048 return false;
1049
1050 // If the truncated value is not an induction variable, return false.
1051 return Legal->isInductionPhi(Op);
1052 }
1053
1054 /// Collects the instructions to scalarize for each predicated instruction in
1055 /// the loop.
1056 void collectInstsToScalarize(ElementCount VF);
1057
1058 /// Collect values that will not be widened, including Uniforms, Scalars, and
1059 /// Instructions to Scalarize for the given \p VF.
1060 /// The sets depend on CM decision for Load/Store instructions
1061 /// that may be vectorized as interleave, gather-scatter or scalarized.
1062 /// Also make a decision on what to do about call instructions in the loop
1063 /// at that VF -- scalarize, call a known vector routine, or call a
1064 /// vector intrinsic.
1066 // Do the analysis once.
1067 if (VF.isScalar() || Uniforms.contains(VF))
1068 return;
1070 collectLoopUniforms(VF);
1072 collectLoopScalars(VF);
1074 }
1075
1076 /// Given costs for both strategies, return true if the scalar predication
1077 /// lowering should be used for div/rem. This incorporates an override
1078 /// option so it is not simply a cost comparison.
1080 InstructionCost MaskedCost) const {
1081 switch (ForceMaskedDivRem) {
1082 case cl::BOU_UNSET:
1083 return ScalarCost < MaskedCost;
1084 case cl::BOU_TRUE:
1085 return false;
1086 case cl::BOU_FALSE:
1087 return true;
1088 }
1089 llvm_unreachable("impossible case value");
1090 }
1091
1092 /// Returns true if \p I is an instruction which requires predication and
1093 /// for which our chosen predication strategy is scalarization (i.e. we
1094 /// don't have an alternate strategy such as masking available).
1095 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1096 bool isScalarWithPredication(Instruction *I, ElementCount VF);
1097
1098 /// Wrapper function for LoopVectorizationLegality::isMaskRequired,
1099 /// that passes the Instruction \p I and if we fold tail.
1100 bool isMaskRequired(Instruction *I) const;
1101
1102 /// Returns true if \p I is an instruction that needs to be predicated
1103 /// at runtime. The result is independent of the predication mechanism.
1104 /// Superset of instructions that return true for isScalarWithPredication.
1105 bool isPredicatedInst(Instruction *I) const;
1106
1107 /// A helper function that returns how much we should divide the cost of a
1108 /// predicated block by. Typically this is the reciprocal of the block
1109 /// probability, i.e. if we return X we are assuming the predicated block will
1110 /// execute once for every X iterations of the loop header so the block should
1111 /// only contribute 1/X of its cost to the total cost calculation, but when
1112 /// optimizing for code size it will just be 1 as code size costs don't depend
1113 /// on execution probabilities.
1114 ///
1115 /// Note that if a block wasn't originally predicated but was predicated due
1116 /// to tail folding, the divisor will still be 1 because it will execute for
1117 /// every iteration of the loop header.
1118 inline uint64_t
1119 getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1120 const BasicBlock *BB);
1121
1122 /// Returns true if an artificially high cost for emulated masked memrefs
1123 /// should be used.
1124 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1125
1126 /// Return the costs for our two available strategies for lowering a
1127 /// div/rem operation which requires speculating at least one lane.
1128 /// First result is for scalarization (will be invalid for scalable
1129 /// vectors); second is for the masked intrinsic strategy.
1130 std::pair<InstructionCost, InstructionCost>
1131 getDivRemSpeculationCost(Instruction *I, ElementCount VF);
1132
1133 /// Returns true if \p I is a memory instruction with consecutive memory
1134 /// access that can be widened.
1135 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1136
1137 /// Returns true if \p I is a memory instruction in an interleaved-group
1138 /// of memory accesses that can be vectorized with wide vector loads/stores
1139 /// and shuffles.
1140 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1141
1142 /// Check if \p Instr belongs to any interleaved access group.
1144 return InterleaveInfo.isInterleaved(Instr);
1145 }
1146
1147 /// Get the interleaved access group that \p Instr belongs to.
1150 return InterleaveInfo.getInterleaveGroup(Instr);
1151 }
1152
1153 /// Returns true if we're required to use a scalar epilogue for at least
1154 /// the final iteration of the original loop.
1155 bool requiresScalarEpilogue(bool IsVectorizing) const {
1156 if (!isEpilogueAllowed()) {
1157 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1158 return false;
1159 }
1160 // If we might exit from anywhere but the latch and early exit vectorization
1161 // is disabled, we must run the exiting iteration in scalar form.
1162 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1163 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1164 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1165 "from latch block\n");
1166 return true;
1167 }
1168 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1169 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1170 "interleaved group requires scalar epilogue\n");
1171 return true;
1172 }
1173 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1174 return false;
1175 }
1176
1177 /// Returns true if an epilogue is allowed (e.g., not prevented by
1178 /// optsize or a loop hint annotation).
1179 bool isEpilogueAllowed() const {
1180 return EpilogueLoweringStatus == CM_EpilogueAllowed;
1181 }
1182
1183 /// Returns true if tail-folding is preferred over an epilogue.
1185 return EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail ||
1186 EpilogueLoweringStatus == CM_EpilogueNotAllowedFoldTail;
1187 }
1188
1189 /// Returns the TailFoldingStyle that is best for the current loop.
1191 return ChosenTailFoldingStyle;
1192 }
1193
1194 /// Selects and saves TailFoldingStyle.
1195 /// \param IsScalableVF true if scalable vector factors are enabled.
1196 /// \param UserIC User-specified interleave count.
1197 void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC) {
1198 assert(ChosenTailFoldingStyle == TailFoldingStyle::None &&
1199 "Tail folding must not be selected yet.");
1200 if (!Legal->canFoldTailByMasking()) {
1201 ChosenTailFoldingStyle = TailFoldingStyle::None;
1202 return;
1203 }
1204
1205 // Default to TTI preference, but allow command line override.
1206 ChosenTailFoldingStyle = TTI.getPreferredTailFoldingStyle();
1207 if (ForceTailFoldingStyle.getNumOccurrences())
1208 ChosenTailFoldingStyle = ForceTailFoldingStyle.getValue();
1209
1210 if (ChosenTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1211 return;
1212 // Override EVL styles if needed.
1213 // FIXME: Investigate opportunity for fixed vector factor.
1214 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1215 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1216 if (EVLIsLegal)
1217 return;
1218 // If for some reason EVL mode is unsupported, fallback to an epilogue
1219 // if it's allowed, or DataWithoutLaneMask otherwise.
1220 if (EpilogueLoweringStatus == CM_EpilogueAllowed ||
1221 EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail)
1222 ChosenTailFoldingStyle = TailFoldingStyle::None;
1223 else
1224 ChosenTailFoldingStyle = TailFoldingStyle::DataWithoutLaneMask;
1225
1226 LLVM_DEBUG(
1227 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1228 "not try to generate VP Intrinsics "
1229 << (UserIC > 1
1230 ? "since interleave count specified is greater than 1.\n"
1231 : "due to non-interleaving reasons.\n"));
1232 }
1233
1234 /// Returns true if all loop blocks should be masked to fold tail loop.
1235 bool foldTailByMasking() const {
1237 }
1238
1239 /// Returns true if the use of wide lane masks is requested and the loop is
1240 /// using tail-folding with a lane mask for control flow.
1243 return false;
1244
1246 }
1247
1248 /// Returns true if the instructions in this block require predication
1249 /// for any reason, e.g. because tail folding now requires a predicate
1250 /// or because the block in the original loop was predicated.
1252 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1253 }
1254
1255 /// Returns true if VP intrinsics with explicit vector length support should
1256 /// be generated in the tail folded loop.
1260
1261 /// Returns true if the predicated reduction select should be used to set the
1262 /// incoming value for the reduction phi.
1263 bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const {
1264 // Force to use predicated reduction select since the EVL of the
1265 // second-to-last iteration might not be VF*UF.
1266 if (foldTailWithEVL())
1267 return true;
1268
1269 // Note: For FindLast recurrences we prefer a predicated select to simplify
1270 // matching in handleFindLastReductions(), rather than handle multiple
1271 // cases.
1273 return true;
1274
1276 TTI.preferPredicatedReductionSelect();
1277 }
1278
1279 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1280 /// with factor VF. Return the cost of the instruction, including
1281 /// scalarization overhead if it's needed.
1282 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1283
1284 /// Estimate cost of a call instruction CI if it were vectorized with factor
1285 /// VF. Return the cost of the instruction, including scalarization overhead
1286 /// if it's needed.
1287 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1288
1289 /// Invalidates decisions already taken by the cost model.
1291 WideningDecisions.clear();
1292 CallWideningDecisions.clear();
1293 Uniforms.clear();
1294 Scalars.clear();
1295 }
1296
1297 /// Returns the expected execution cost. The unit of the cost does
1298 /// not matter because we use the 'cost' units to compare different
1299 /// vector widths. The cost that is returned is *not* normalized by
1300 /// the factor width.
1301 InstructionCost expectedCost(ElementCount VF);
1302
1303 /// Returns true if epilogue vectorization is considered profitable, and
1304 /// false otherwise.
1305 /// \p VF is the vectorization factor chosen for the original loop.
1306 /// \p IC is an additional scaling factor applied to VF before
1307 /// comparing to EpilogueVectorizationMinVF.
1308 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1309 const unsigned IC) const;
1310
1311 /// Returns the execution time cost of an instruction for a given vector
1312 /// width. Vector width of one means scalar.
1313 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1314
1315 /// Return the cost of instructions in an inloop reduction pattern, if I is
1316 /// part of that pattern.
1317 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1318 ElementCount VF,
1319 Type *VectorTy) const;
1320
1321 /// Returns true if \p Op should be considered invariant and if it is
1322 /// trivially hoistable.
1323 bool shouldConsiderInvariant(Value *Op);
1324
1325 /// Returns true if \p I has been forced to be scalarized at \p VF.
1327 auto FS = ForcedScalars.find(VF);
1328 return FS != ForcedScalars.end() && FS->second.contains(I);
1329 }
1330
1331private:
1332 unsigned NumPredStores = 0;
1333
1334 /// VF selection state independent of cost-modeling decisions.
1335 VFSelectionContext &Config;
1336
1337 /// Calculate vectorization cost of memory instruction \p I.
1338 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1339
1340 /// The cost computation for scalarized memory instruction.
1341 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1342
1343 /// The cost computation for interleaving group of memory instructions.
1344 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1345
1346 /// The cost computation for Gather/Scatter instruction.
1347 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1348
1349 /// The cost computation for widening instruction \p I with consecutive
1350 /// memory access.
1351 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1352
1353 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1354 /// Load: scalar load + broadcast.
1355 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1356 /// element)
1357 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1358
1359 /// Estimate the overhead of scalarizing an instruction. This is a
1360 /// convenience wrapper for the type-based getScalarizationOverhead API.
1362 ElementCount VF) const;
1363
1364 /// A type representing the costs for instructions if they were to be
1365 /// scalarized rather than vectorized. The entries are Instruction-Cost
1366 /// pairs.
1367 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1368
1369 /// A set containing all BasicBlocks that are known to be present after
1370 /// vectorization as a predicated block.
1372 PredicatedBBsAfterVectorization;
1373
1374 /// Records whether it is allowed to have the original scalar loop execute at
1375 /// least once. This may be needed as a fallback loop in case runtime
1376 /// aliasing/dependence checks fail, or to handle the tail/remainder
1377 /// iterations when the trip count is unknown or doesn't divide by the VF,
1378 /// or as a peel-loop to handle gaps in interleave-groups.
1379 /// Under optsize and when the trip count is very small we don't allow any
1380 /// iterations to execute in the scalar loop.
1381 EpilogueLowering EpilogueLoweringStatus = CM_EpilogueAllowed;
1382
1383 /// Control finally chosen tail folding style.
1384 TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;
1385
1386 /// A map holding scalar costs for different vectorization factors. The
1387 /// presence of a cost for an instruction in the mapping indicates that the
1388 /// instruction will be scalarized when vectorizing with the associated
1389 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1391
1392 /// Holds the instructions known to be uniform after vectorization.
1393 /// The data is collected per VF.
1395
1396 /// Holds the instructions known to be scalar after vectorization.
1397 /// The data is collected per VF.
1399
1400 /// Holds the instructions (address computations) that are forced to be
1401 /// scalarized.
1403
1404 /// Returns the expected difference in cost from scalarizing the expression
1405 /// feeding a predicated instruction \p PredInst. The instructions to
1406 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1407 /// non-negative return value implies the expression will be scalarized.
1408 /// Currently, only single-use chains are considered for scalarization.
1409 InstructionCost computePredInstDiscount(Instruction *PredInst,
1410 ScalarCostsTy &ScalarCosts,
1411 ElementCount VF);
1412
1413 /// Collect the instructions that are uniform after vectorization. An
1414 /// instruction is uniform if we represent it with a single scalar value in
1415 /// the vectorized loop corresponding to each vector iteration. Examples of
1416 /// uniform instructions include pointer operands of consecutive or
1417 /// interleaved memory accesses. Note that although uniformity implies an
1418 /// instruction will be scalar, the reverse is not true. In general, a
1419 /// scalarized instruction will be represented by VF scalar values in the
1420 /// vectorized loop, each corresponding to an iteration of the original
1421 /// scalar loop.
1422 void collectLoopUniforms(ElementCount VF);
1423
1424 /// Collect the instructions that are scalar after vectorization. An
1425 /// instruction is scalar if it is known to be uniform or will be scalarized
1426 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1427 /// to the list if they are used by a load/store instruction that is marked as
1428 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1429 /// VF values in the vectorized loop, each corresponding to an iteration of
1430 /// the original scalar loop.
1431 void collectLoopScalars(ElementCount VF);
1432
1433 /// Keeps cost model vectorization decision and cost for instructions.
1434 /// Right now it is used for memory instructions only.
1436 std::pair<InstWidening, InstructionCost>>;
1437
1438 DecisionList WideningDecisions;
1439
1440 using CallDecisionList =
1441 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1442
1443 CallDecisionList CallWideningDecisions;
1444
1445 /// Returns true if \p V is expected to be vectorized and it needs to be
1446 /// extracted.
1447 bool needsExtract(Value *V, ElementCount VF) const {
1449 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1450 TheLoop->isLoopInvariant(I) ||
1451 getWideningDecision(I, VF) == CM_Scalarize ||
1452 (isa<CallInst>(I) &&
1453 getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
1454 return false;
1455
1456 // Assume we can vectorize V (and hence we need extraction) if the
1457 // scalars are not computed yet. This can happen, because it is called
1458 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1459 // the scalars are collected. That should be a safe assumption in most
1460 // cases, because we check if the operands have vectorizable types
1461 // beforehand in LoopVectorizationLegality.
1462 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1463 };
1464
1465 /// Returns a range containing only operands needing to be extracted.
1466 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1467 ElementCount VF) const {
1468
1469 SmallPtrSet<const Value *, 4> UniqueOperands;
1471 for (Value *Op : Ops) {
1472 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
1473 !needsExtract(Op, VF))
1474 continue;
1475 Res.push_back(Op);
1476 }
1477 return Res;
1478 }
1479
1480public:
1481 /// The loop that we evaluate.
1483
1484 /// Predicated scalar evolution analysis.
1486
1487 /// Loop Info analysis.
1489
1490 /// Vectorization legality.
1492
1493 /// Vector target information.
1495
1496 /// Target Library Info.
1498
1499 /// Assumption cache.
1501
1502 /// Interface to emit optimization remarks.
1504
1505 /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
1506 /// unless necessary, e.g. when the loop isn't legal to vectorize or when
1507 /// there is no predication.
1508 std::function<BlockFrequencyInfo &()> GetBFI;
1509 /// The BlockFrequencyInfo returned from GetBFI.
1511 /// Returns the BlockFrequencyInfo for the function if cached, otherwise
1512 /// fetches it via GetBFI. Avoids an indirect call to the std::function.
1514 if (!BFI)
1515 BFI = &GetBFI();
1516 return *BFI;
1517 }
1518
1520
1521 /// Loop Vectorize Hint.
1523
1524 /// The interleave access information contains groups of interleaved accesses
1525 /// with the same stride and close to each other.
1527
1528 /// Values to ignore in the cost model.
1530
1531 /// Values to ignore in the cost model when VF > 1.
1533};
1534} // end namespace llvm
1535
1536namespace {
1537/// Helper struct to manage generating runtime checks for vectorization.
1538///
1539/// The runtime checks are created up-front in temporary blocks to allow better
1540/// estimating the cost and un-linked from the existing IR. After deciding to
1541/// vectorize, the checks are moved back. If deciding not to vectorize, the
1542/// temporary blocks are completely removed.
1543class GeneratedRTChecks {
1544 /// Basic block which contains the generated SCEV checks, if any.
1545 BasicBlock *SCEVCheckBlock = nullptr;
1546
1547 /// The value representing the result of the generated SCEV checks. If it is
1548 /// nullptr no SCEV checks have been generated.
1549 Value *SCEVCheckCond = nullptr;
1550
1551 /// Basic block which contains the generated memory runtime checks, if any.
1552 BasicBlock *MemCheckBlock = nullptr;
1553
1554 /// The value representing the result of the generated memory runtime checks.
1555 /// If it is nullptr no memory runtime checks have been generated.
1556 Value *MemRuntimeCheckCond = nullptr;
1557
1558 DominatorTree *DT;
1559 LoopInfo *LI;
1561
1562 SCEVExpander SCEVExp;
1563 SCEVExpander MemCheckExp;
1564
1565 bool CostTooHigh = false;
1566
1567 Loop *OuterLoop = nullptr;
1568
1570
1571 /// The kind of cost that we are calculating
1573
1574public:
1575 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1578 : DT(DT), LI(LI), TTI(TTI),
1579 SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1580 MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1581 PSE(PSE), CostKind(CostKind) {}
1582
1583 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1584 /// accurately estimate the cost of the runtime checks. The blocks are
1585 /// un-linked from the IR and are added back during vector code generation. If
1586 /// there is no vector code generation, the check blocks are removed
1587 /// completely.
1588 void create(Loop *L, const LoopAccessInfo &LAI,
1589 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
1590 OptimizationRemarkEmitter &ORE) {
1591
1592 // Hard cutoff to limit compile-time increase in case a very large number of
1593 // runtime checks needs to be generated.
1594 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1595 // profile info.
1596 CostTooHigh =
1598 if (CostTooHigh) {
1599 // Mark runtime checks as never succeeding when they exceed the threshold.
1600 MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
1601 SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
1602 ORE.emit([&]() {
1603 return OptimizationRemarkAnalysisAliasing(
1604 DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
1605 L->getHeader())
1606 << "loop not vectorized: too many memory checks needed";
1607 });
1608 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
1609 return;
1610 }
1611
1612 BasicBlock *LoopHeader = L->getHeader();
1613 BasicBlock *Preheader = L->getLoopPreheader();
1614
1615 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1616 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1617 // may be used by SCEVExpander. The blocks will be un-linked from their
1618 // predecessors and removed from LI & DT at the end of the function.
1619 if (!UnionPred.isAlwaysTrue()) {
1620 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1621 nullptr, "vector.scevcheck");
1622
1623 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1624 &UnionPred, SCEVCheckBlock->getTerminator());
1625 if (isa<Constant>(SCEVCheckCond)) {
1626 // Clean up directly after expanding the predicate to a constant, to
1627 // avoid further expansions re-using anything left over from SCEVExp.
1628 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1629 SCEVCleaner.cleanup();
1630 }
1631 }
1632
1633 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1634 if (RtPtrChecking.Need) {
1635 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1636 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1637 "vector.memcheck");
1638
1639 auto DiffChecks = RtPtrChecking.getDiffChecks();
1640 if (DiffChecks) {
1641 Value *RuntimeVF = nullptr;
1642 MemRuntimeCheckCond = addDiffRuntimeChecks(
1643 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1644 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1645 if (!RuntimeVF)
1646 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1647 return RuntimeVF;
1648 },
1649 IC);
1650 } else {
1651 MemRuntimeCheckCond = addRuntimeChecks(
1652 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1654 }
1655 assert(MemRuntimeCheckCond &&
1656 "no RT checks generated although RtPtrChecking "
1657 "claimed checks are required");
1658 }
1659
1660 SCEVExp.eraseDeadInstructions(SCEVCheckCond);
1661
1662 if (!MemCheckBlock && !SCEVCheckBlock)
1663 return;
1664
1665 // Unhook the temporary block with the checks, update various places
1666 // accordingly.
1667 if (SCEVCheckBlock)
1668 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1669 if (MemCheckBlock)
1670 MemCheckBlock->replaceAllUsesWith(Preheader);
1671
1672 if (SCEVCheckBlock) {
1673 SCEVCheckBlock->getTerminator()->moveBefore(
1674 Preheader->getTerminator()->getIterator());
1675 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1676 UI->setDebugLoc(DebugLoc::getTemporary());
1677 Preheader->getTerminator()->eraseFromParent();
1678 }
1679 if (MemCheckBlock) {
1680 MemCheckBlock->getTerminator()->moveBefore(
1681 Preheader->getTerminator()->getIterator());
1682 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1683 UI->setDebugLoc(DebugLoc::getTemporary());
1684 Preheader->getTerminator()->eraseFromParent();
1685 }
1686
1687 DT->changeImmediateDominator(LoopHeader, Preheader);
1688 if (MemCheckBlock) {
1689 DT->eraseNode(MemCheckBlock);
1690 LI->removeBlock(MemCheckBlock);
1691 }
1692 if (SCEVCheckBlock) {
1693 DT->eraseNode(SCEVCheckBlock);
1694 LI->removeBlock(SCEVCheckBlock);
1695 }
1696
1697 // Outer loop is used as part of the later cost calculations.
1698 OuterLoop = L->getParentLoop();
1699 }
1700
1702 if (SCEVCheckBlock || MemCheckBlock)
1703 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1704
1705 if (CostTooHigh) {
1707 Cost.setInvalid();
1708 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1709 return Cost;
1710 }
1711
1712 InstructionCost RTCheckCost = 0;
1713 if (SCEVCheckBlock)
1714 for (Instruction &I : *SCEVCheckBlock) {
1715 if (SCEVCheckBlock->getTerminator() == &I)
1716 continue;
1718 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1719 RTCheckCost += C;
1720 }
1721 if (MemCheckBlock) {
1722 InstructionCost MemCheckCost = 0;
1723 for (Instruction &I : *MemCheckBlock) {
1724 if (MemCheckBlock->getTerminator() == &I)
1725 continue;
1727 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1728 MemCheckCost += C;
1729 }
1730
1731 // If the runtime memory checks are being created inside an outer loop
1732 // we should find out if these checks are outer loop invariant. If so,
1733 // the checks will likely be hoisted out and so the effective cost will
1734 // reduce according to the outer loop trip count.
1735 if (OuterLoop) {
1736 ScalarEvolution *SE = MemCheckExp.getSE();
1737 // TODO: If profitable, we could refine this further by analysing every
1738 // individual memory check, since there could be a mixture of loop
1739 // variant and invariant checks that mean the final condition is
1740 // variant.
1741 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1742 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1743 // It seems reasonable to assume that we can reduce the effective
1744 // cost of the checks even when we know nothing about the trip
1745 // count. Assume that the outer loop executes at least twice.
1746 unsigned BestTripCount = 2;
1747
1748 // Get the best known TC estimate.
1749 if (auto EstimatedTC = getSmallBestKnownTC(
1750 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1751 if (EstimatedTC->isFixed())
1752 BestTripCount = EstimatedTC->getFixedValue();
1753
1754 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1755
1756 // Let's ensure the cost is always at least 1.
1757 NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
1758 (InstructionCost::CostType)1);
1759
1760 if (BestTripCount > 1)
1762 << "We expect runtime memory checks to be hoisted "
1763 << "out of the outer loop. Cost reduced from "
1764 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1765
1766 MemCheckCost = NewMemCheckCost;
1767 }
1768 }
1769
1770 RTCheckCost += MemCheckCost;
1771 }
1772
1773 if (SCEVCheckBlock || MemCheckBlock)
1774 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1775 << "\n");
1776
1777 return RTCheckCost;
1778 }
1779
1780 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1781 /// unused.
1782 ~GeneratedRTChecks() {
1783 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1784 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1785 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
1786 bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
1787 if (SCEVChecksUsed)
1788 SCEVCleaner.markResultUsed();
1789
1790 if (MemChecksUsed) {
1791 MemCheckCleaner.markResultUsed();
1792 } else {
1793 auto &SE = *MemCheckExp.getSE();
1794 // Memory runtime check generation creates compares that use expanded
1795 // values. Remove them before running the SCEVExpanderCleaners.
1796 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1797 if (MemCheckExp.isInsertedInstruction(&I))
1798 continue;
1799 SE.forgetValue(&I);
1800 I.eraseFromParent();
1801 }
1802 }
1803 MemCheckCleaner.cleanup();
1804 SCEVCleaner.cleanup();
1805
1806 if (!SCEVChecksUsed)
1807 SCEVCheckBlock->eraseFromParent();
1808 if (!MemChecksUsed)
1809 MemCheckBlock->eraseFromParent();
1810 }
1811
1812 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
1813 /// outside VPlan.
1814 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
1815 using namespace llvm::PatternMatch;
1816 if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
1817 return {nullptr, nullptr};
1818
1819 return {SCEVCheckCond, SCEVCheckBlock};
1820 }
1821
1822 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
1823 /// outside VPlan.
1824 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
1825 using namespace llvm::PatternMatch;
1826 if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
1827 return {nullptr, nullptr};
1828 return {MemRuntimeCheckCond, MemCheckBlock};
1829 }
1830
1831 /// Return true if any runtime checks have been added
1832 bool hasChecks() const {
1833 return getSCEVChecks().first || getMemRuntimeChecks().first;
1834 }
1835};
1836} // namespace
1837
1839 return Style == TailFoldingStyle::Data ||
1841}
1842
1846
1847// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1848// vectorization. The loop needs to be annotated with #pragma omp simd
1849// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1850// vector length information is not provided, vectorization is not considered
1851// explicit. Interleave hints are not allowed either. These limitations will be
1852// relaxed in the future.
1853// Please note that we are currently forced to abuse the pragma 'clang
1854// vectorize' semantics. This pragma provides *auto-vectorization hints*
1855// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1856// provides *explicit vectorization hints* (LV can bypass legal checks and
1857// assume that vectorization is legal). However, both hints are implemented
1858// using the same metadata (llvm.loop.vectorize, processed by
1859// LoopVectorizeHints). This will be fixed in the future when the native IR
1860// representation for pragma 'omp simd' is introduced.
// NOTE(review): listing lines 1862 (rest of the parameter list) and 1868 (the
// condition guarding the first `return false`) are missing from this extract.
1861static bool isExplicitVecOuterLoop(Loop *OuterLp,
1863 assert(!OuterLp->isInnermost() && "This is not an outer loop");
// The hints object is constructed with interleaving disabled.
1864 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1865
1866 // Only outer loops with an explicit vectorization hint are supported.
1867 // Unannotated outer loops are ignored.
1869 return false;
1870
// Ask the hints whether vectorization is allowed, in the mode that only
// vectorizes when forced.
1871 Function *Fn = OuterLp->getHeader()->getParent();
1872 if (!Hints.allowVectorization(Fn, OuterLp,
1873 true /*VectorizeOnlyWhenForced*/)) {
1874 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1875 return false;
1876 }
1877
// An interleave hint above 1 is rejected, and a remark is emitted.
1878 if (Hints.getInterleave() > 1) {
1879 // TODO: Interleave support is future work.
1880 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1881 "outer loops.\n");
1882 Hints.emitRemarkWithHints();
1883 return false;
1884 }
1885
1886 return true;
1887}
1888
// NOTE(review): the function header (listing lines 1889-1891) and listing
// lines 1897 and 1900 are missing from this extract; only the remaining body
// lines are shown below.
1892 // Collect inner loops and outer loops without irreducible control flow. For
1893 // now, only collect outer loops that have explicit vectorization hints. If we
1894 // are stress testing the VPlan H-CFG construction, we collect the outermost
1895 // loop of every loop nest.
1896 if (L.isInnermost() || VPlanBuildOuterloopStressTest ||
1898 LoopBlocksRPO RPOT(&L);
1899 RPOT.perform(LI);
1901 V.push_back(&L);
1902 // TODO: Collect inner loops inside marked outer loops in case
1903 // vectorization fails for the outer loop. Do not invoke
1904 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1905 // already known to be reducible. We can use an inherited attribute for
1906 // that.
1907 return;
1908 }
1909 }
// L itself was not collected: recurse into each of its sub-loops.
1910 for (Loop *InnerL : L)
1911 collectSupportedLoops(*InnerL, LI, ORE, V);
1912}
1913
1914//===----------------------------------------------------------------------===//
1915// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1916// LoopVectorizationCostModel and LoopVectorizationPlanner.
1917//===----------------------------------------------------------------------===//
1918
1919/// For the given VF and UF and maximum trip count computed for the loop, return
1920/// true if the induction variable is known not to overflow in the vectorized
1921/// loop. In that case a runtime overflow check always evaluates to false and
1922/// can be removed. Returns false whenever this cannot be proven.
// NOTE(review): listing line 1923 (first line of the function header) is
// missing from this extract.
1924 const LoopVectorizationCostModel *Cost,
1925 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
1926 // Always be conservative if we don't know the exact unroll factor.
1927 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
1928
// Start from the mask of the widest induction type, i.e. the largest unsigned
// value representable in that type.
1929 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
1930 APInt MaxUIntTripCount = IdxTy->getMask();
1931
1932 // We know the runtime overflow check is known false iff the (max) trip-count
1933 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
1934 // the vector loop induction variable.
1935 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
1936 uint64_t MaxVF = VF.getKnownMinValue();
1937 if (VF.isScalable()) {
// For scalable VFs the bound needs a known maximum vscale; without one, give
// up conservatively.
1938 std::optional<unsigned> MaxVScale =
1939 getMaxVScale(*Cost->TheFunction, Cost->TTI);
1940 if (!MaxVScale)
1941 return false;
1942 MaxVF *= *MaxVScale;
1943 }
1944
// The headroom left above the trip count must exceed one full vector step.
1945 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
1946 }
1947
// No constant maximum trip count is known.
1948 return false;
1949}
1950
1951// Return whether we allow using masked interleave-groups (for dealing with
1952// strided loads/stores that reside in predicated blocks, or for dealing
1953// with gaps).
// NOTE(review): listing lines 1954 (function header) and 1957 (the statement
// controlled by the `if` below) are missing from this extract.
1955 // If an override option has been passed in for interleaved accesses, use it.
1956 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
1958
// Otherwise defer to the target's preference.
1959 return TTI.enableMaskedInterleavedAccessVectorization();
1960}
1961
1962/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. Phi recipes of \p
1963/// VPBB are moved to the position that was the start of the new block when it
/// was created; the recipes handled by the second loop are moved to its end. All
1964/// predecessors and successors of VPBB, if any, are rewired to the new
1965/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
// NOTE(review): listing lines 1966 (first line of the function header) and
// 1977 (the range expression of the second loop) are missing from this extract.
1967 BasicBlock *IRBB,
1968 VPlan *Plan = nullptr) {
// Fall back to the plan that owns VPBB when none was supplied.
1969 if (!Plan)
1970 Plan = VPBB->getPlan();
1971 VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
// IP is captured once, before any recipe is moved.
1972 auto IP = IRVPBB->begin();
1973 for (auto &R : make_early_inc_range(VPBB->phis()))
1974 R.moveBefore(*IRVPBB, IP);
1975
1976 for (auto &R :
1978 R.moveBefore(*IRVPBB, IRVPBB->end());
1979
1980 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
1981 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
1982 return IRVPBB;
1983}
1984
// NOTE(review): listing line 1985 (the function header) is missing from this
// extract. The body splits the original loop's preheader at its terminator
// and returns the block produced by SplitBlock, named <Prefix>scalar.ph.
1986 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
1987 assert(VectorPH && "Invalid loop structure");
1988 assert((OrigLoop->getUniqueLatchExitBlock() ||
1989 Cost->requiresScalarEpilogue(VF.isVector())) &&
1990 "loops not exiting via the latch without required epilogue?");
1991
1992 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
1993 // wrapping the newly created scalar preheader here at the moment, because the
1994 // Plan's scalar preheader may be unreachable at this point. Instead it is
1995 // replaced in executePlan.
1996 return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
1997 Twine(Prefix) + "scalar.ph");
1998}
1999
2000/// Knowing that loop \p L executes a single vector iteration, add instructions
2001/// that will get simplified and thus should not have any cost to \p
2002/// InstsToIgnore. These are the latch compare (if any) and every induction
/// update whose only users are its own phi and that compare.
// NOTE(review): listing lines 2003-2004 (start of the function header) are
// missing from this extract.
2005 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2006 auto *Cmp = L->getLatchCmpInst();
2007 if (Cmp)
2008 InstsToIgnore.insert(Cmp);
2009 for (const auto &KV : IL) {
2010 // Extract the key by hand so that it can be used in the lambda below. Note
2011 // that captured structured bindings are a C++20 extension.
2012 const PHINode *IV = KV.first;
2013
2014 // Get next iteration value of the induction variable.
2015 Instruction *IVInst =
2016 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
// Ignore the update only when nothing but the phi and the latch compare
// uses it.
2017 if (all_of(IVInst->users(),
2018 [&](const User *U) { return U == IV || U == Cmp; }))
2019 InstsToIgnore.insert(IVInst);
2020 }
2021}
2022
// NOTE(review): listing line 2023 (the function header) is missing from this
// extract.
2024 // Create a new IR basic block for the scalar preheader.
2025 BasicBlock *ScalarPH = createScalarPreheader("");
// Return the single predecessor of the scalar preheader just created.
2026 return ScalarPH->getSinglePredecessor();
2027}
2028
2029namespace {
2030
// Key traits for a map keyed on Instruction pointers, where two keys compare
// equal when the instructions are identical (isIdenticalTo) rather than when
// the pointers match. Used by legacyCSE via canHandle().
// NOTE(review): listing lines 2033-2034 (body of canHandle) and 2038 (body of
// getEmptyKey) are missing from this extract.
2031struct CSEDenseMapInfo {
2032 static bool canHandle(const Instruction *I) {
2035 }
2036
2037 static inline Instruction *getEmptyKey() {
2039 }
2040
2041 static inline Instruction *getTombstoneKey() {
2042 return DenseMapInfo<Instruction *>::getTombstoneKey();
2043 }
2044
// Hash over the opcode and the operand values; only valid for instructions
// accepted by canHandle().
2045 static unsigned getHashValue(const Instruction *I) {
2046 assert(canHandle(I) && "Unknown instruction!");
2047 return hash_combine(I->getOpcode(),
2048 hash_combine_range(I->operand_values()));
2049 }
2050
// The empty and tombstone keys are sentinels and must never be dereferenced,
// so they are compared by pointer only.
2051 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2052 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2053 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2054 return LHS == RHS;
2055 return LHS->isIdenticalTo(RHS);
2056 }
2057};
2058
2059} // end anonymous namespace
2060
2061/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2062/// removal, in favor of the VPlan-based one.
/// Within \p BB, every instruction accepted by CSEDenseMapInfo::canHandle that
/// matches an earlier one in the map is replaced by that earlier instruction
/// and erased.
// NOTE(review): listing line 2065 (the declaration of CSEMap) is missing from
// this extract.
2063static void legacyCSE(BasicBlock *BB) {
2064 // Perform simple cse.
2066 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2067 if (!CSEDenseMapInfo::canHandle(&In))
2068 continue;
2069
2070 // Check if we can replace this instruction with any of the
2071 // visited instructions.
2072 if (Instruction *V = CSEMap.lookup(&In)) {
2073 In.replaceAllUsesWith(V);
2074 In.eraseFromParent();
2075 continue;
2076 }
2077
// First occurrence: record it so that later duplicates map to it.
2078 CSEMap[&In] = &In;
2079 }
2080}
2081
2082/// This function attempts to return a value that represents the ElementCount
2083/// at runtime. For fixed-width VFs we know this precisely at compile
2084/// time, but for scalable VFs we calculate it based on an estimate of the
2085/// vscale value. If \p VF is scalable and no \p VScale estimate is provided,
/// the known minimum element count is returned unchanged.
// NOTE(review): listing line 2086 (first line of the function header) is
// missing from this extract.
2087 std::optional<unsigned> VScale) {
2088 unsigned EstimatedVF = VF.getKnownMinValue();
2089 if (VF.isScalable())
2090 if (VScale)
2091 EstimatedVF *= *VScale;
2092 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2093 return EstimatedVF;
2094}
2095
// NOTE(review): listing lines 2096-2097 (start of the function header), 2105,
// 2109 (declaration of Tys) and 2117-2118 (the condition and declaration that
// introduce IntrinsicCost) are missing from this extract.
2098 ElementCount VF) const {
2099 // We only need to calculate a cost if the VF is scalar; for actual vectors
2100 // we should already have a pre-calculated cost at each VF.
2101 if (!VF.isScalar())
2102 return getCallWideningDecision(CI, VF).Cost;
2103
2104 Type *RetTy = CI->getType();
// A reduction-pattern cost, when available, takes precedence.
2106 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2107 return *RedCost;
2108
// Gather the argument types of the call for the cost query below.
2110 for (auto &ArgOp : CI->args())
2111 Tys.push_back(ArgOp->getType());
2112
2113 InstructionCost ScalarCallCost = TTI.getCallInstrCost(
2114 CI->getCalledFunction(), RetTy, Tys, Config.CostKind);
2115
2116 // If this is an intrinsic we may have a lower cost for it.
2119 return std::min(ScalarCallCost, IntrinsicCost);
2120 }
2121 return ScalarCallCost;
2122}
2123
// NOTE(review): listing line 2124 (the function header) is missing from this
// extract. The body returns Ty unchanged for a scalar VF or a type that
// cannot be vectorized, and the vectorized form of Ty otherwise.
2125 if (VF.isScalar() || !canVectorizeTy(Ty))
2126 return Ty;
2127 return toVectorizedTy(Ty, VF);
2128}
2129
// NOTE(review): listing lines 2130-2131 (start of the function header), 2133
// (definition of ID), 2140-2141 (definitions of Arguments and FTy) and
// 2148-2149 (remaining constructor arguments) are missing from this extract.
2132 ElementCount VF) const {
2134 assert(ID && "Expected intrinsic call!");
2135 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
// Carry over fast-math flags when the call is a floating-point operation.
2136 FastMathFlags FMF;
2137 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2138 FMF = FPMO->getFastMathFlags();
2139
// Each parameter type is passed through maybeVectorizeType for this VF.
2142 SmallVector<Type *> ParamTys;
2143 std::transform(FTy->param_begin(), FTy->param_end(),
2144 std::back_inserter(ParamTys),
2145 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2146
2147 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2150 return TTI.getIntrinsicInstrCost(CostAttrs, Config.CostKind);
2151}
2152
// NOTE(review): listing line 2153 (the function header) is missing from this
// extract.
2154 // Fix widened non-induction PHIs by setting up the PHI operands.
2155 fixNonInductionPHIs(State);
2156
2157 // Don't apply optimizations below when no (vector) loop remains, as they all
2158 // require one at the moment.
2159 VPBasicBlock *HeaderVPBB =
2160 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2161 if (!HeaderVPBB)
2162 return;
2163
// Look up the IR block that corresponds to the VPlan header block.
2164 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2165
2166 // Remove redundant induction instructions.
2167 legacyCSE(HeaderBB);
2168}
2169
// NOTE(review): listing lines 2170 (the function header), 2172 (the outer
// loop over blocks) and 2174 (the definition of VPPhi) are missing from this
// extract.
2171 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2173 for (VPRecipeBase &P : VPBB->phis()) {
2175 if (!VPPhi)
2176 continue;
2177 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2178 // Make sure the builder has a valid insert point.
2179 Builder.SetInsertPoint(NewPhi);
// Add one incoming (value, block) pair per incoming edge recorded on the
// recipe.
2180 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2181 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2182 }
2183 }
2184}
2185
// Collect, for vector factor \p VF, the instructions that will remain scalar
// after vectorization and record them in Scalars[VF].
// NOTE(review): listing lines 2201 (declaration of Worklist), 2205
// (declaration of ScalarPtrs), 2251 (second operand of the condition in
// EvaluatePtrUse) and 2332-2333 (middle of IsDirectLoadStoreFromPtrIndvar)
// are missing from this extract.
2186void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2187 // We should not collect Scalars more than once per VF. Right now, this
2188 // function is called from collectUniformsAndScalars(), which already does
2189 // this check. Collecting Scalars for VF=1 does not make any sense.
2190 assert(VF.isVector() && !Scalars.contains(VF) &&
2191 "This function should not be visited twice for the same VF");
2192
2193 // This avoids any chances of creating a REPLICATE recipe during planning
2194 // since that would result in generation of scalarized code during execution,
2195 // which is not supported for scalable vectors.
2196 if (VF.isScalable()) {
2197 Scalars[VF].insert_range(Uniforms[VF]);
2198 return;
2199 }
2200
2202
2203 // These sets are used to seed the analysis with pointers used by memory
2204 // accesses that will remain scalar.
2206 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2207 auto *Latch = TheLoop->getLoopLatch();
2208
2209 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2210 // The pointer operands of loads and stores will be scalar as long as the
2211 // memory access is not a gather or scatter operation. The value operand of a
2212 // store will remain scalar if the store is scalarized.
2213 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2214 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2215 assert(WideningDecision != CM_Unknown &&
2216 "Widening decision should be ready at this moment");
2217 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2218 if (Ptr == Store->getValueOperand())
2219 return WideningDecision == CM_Scalarize;
2220 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2221 "Ptr is neither a value or pointer operand");
2222 return WideningDecision != CM_GatherScatter;
2223 };
2224
2225 // A helper that returns true if the given value is a getelementptr
2226 // instruction that is not invariant in the loop.
2227 auto IsLoopVaryingGEP = [&](Value *V) {
2228 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2229 };
2230
2231 // A helper that evaluates a memory access's use of a pointer. If the use will
2232 // be a scalar use and the pointer is only used by memory accesses, we place
2233 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2234 // PossibleNonScalarPtrs.
2235 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2236 // We only care about getelementptr instructions that are not invariant in
2237 // the loop.
2238 if (!IsLoopVaryingGEP(Ptr))
2239 return;
2240
2241 // If the pointer has already been identified as scalar (e.g., if it was
2242 // also identified as uniform), there's nothing to do.
2243 auto *I = cast<Instruction>(Ptr);
2244 if (Worklist.count(I))
2245 return;
2246
2247 // If the use of the pointer will be a scalar use, and all users of the
2248 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2249 // place the pointer in PossibleNonScalarPtrs.
2250 if (IsScalarUse(MemAccess, Ptr) &&
2252 ScalarPtrs.insert(I);
2253 else
2254 PossibleNonScalarPtrs.insert(I);
2255 };
2256
2257 // We seed the scalars analysis with two classes of instructions: (1)
2258 // instructions marked uniform-after-vectorization and (2) loop-varying
2259 // getelementptr instructions used by memory accesses
2260 // requiring a scalar use.
2261 //
2262 // (1) Add to the worklist all instructions that have been identified as
2263 // uniform-after-vectorization.
2264 Worklist.insert_range(Uniforms[VF]);
2265
2266 // (2) Add to the worklist all loop-varying getelementptr instructions used by
2267 // memory accesses requiring a scalar use. The pointer operands of loads and
2268 // stores will be scalar unless the operation is a gather or scatter.
2269 // The value operand of a store will remain scalar if the store is scalarized.
2270 for (auto *BB : TheLoop->blocks())
2271 for (auto &I : *BB) {
2272 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2273 EvaluatePtrUse(Load, Load->getPointerOperand());
2274 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2275 EvaluatePtrUse(Store, Store->getPointerOperand());
2276 EvaluatePtrUse(Store, Store->getValueOperand());
2277 }
2278 }
// A pointer is only scalar if no use classified it as possibly non-scalar.
2279 for (auto *I : ScalarPtrs)
2280 if (!PossibleNonScalarPtrs.count(I)) {
2281 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2282 Worklist.insert(I);
2283 }
2284
2285 // Insert the forced scalars.
2286 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2287 // induction variable when the PHI user is scalarized.
2288 auto ForcedScalar = ForcedScalars.find(VF);
2289 if (ForcedScalar != ForcedScalars.end())
2290 for (auto *I : ForcedScalar->second) {
2291 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2292 Worklist.insert(I);
2293 }
2294
2295 // Expand the worklist by looking through the first operand of instructions
2296 // we've already identified as scalar. This is similar to the
2297 // expansion step in collectLoopUniforms(); however, here we're only
2298 // expanding to include additional loop-varying getelementptr instructions.
// The worklist may grow while it is being walked, hence the index-based loop.
2299 unsigned Idx = 0;
2300 while (Idx != Worklist.size()) {
2301 Instruction *Dst = Worklist[Idx++];
2302 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2303 continue;
2304 auto *Src = cast<Instruction>(Dst->getOperand(0));
// Src is scalar if every user is outside the loop, already in the worklist,
// or a load/store that uses Src in a scalar way.
2305 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2306 auto *J = cast<Instruction>(U);
2307 return !TheLoop->contains(J) || Worklist.count(J) ||
2308 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2309 IsScalarUse(J, Src));
2310 })) {
2311 Worklist.insert(Src);
2312 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2313 }
2314 }
2315
2316 // An induction variable will remain scalar if all users of the induction
2317 // variable and induction variable update remain scalar.
2318 for (const auto &Induction : Legal->getInductionVars()) {
2319 auto *Ind = Induction.first;
2320 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2321
2322 // If tail-folding is applied, the primary induction variable will be used
2323 // to feed a vector compare.
2324 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2325 continue;
2326
2327 // Returns true if \p Indvar is a pointer induction that is used directly by
2328 // load/store instruction \p I.
2329 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2330 Instruction *I) {
2331 return Induction.second.getKind() ==
2334 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2335 };
2336
2337 // Determine if all users of the induction variable are scalar after
2338 // vectorization.
2339 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2340 auto *I = cast<Instruction>(U);
2341 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2342 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2343 });
2344 if (!ScalarInd)
2345 continue;
2346
2347 // If the induction variable update is a fixed-order recurrence, neither the
2348 // induction variable nor its update should be marked scalar after
2349 // vectorization.
2350 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2351 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2352 continue;
2353
2354 // Determine if all users of the induction variable update instruction are
2355 // scalar after vectorization.
2356 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2357 auto *I = cast<Instruction>(U);
2358 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2359 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2360 });
2361 if (!ScalarIndUpdate)
2362 continue;
2363
2364 // The induction variable and its update instruction will remain scalar.
2365 Worklist.insert(Ind);
2366 Worklist.insert(IndUpdate);
2367 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2368 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2369 << "\n");
2370 }
2371
// Publish the final worklist as the scalars for this VF.
2372 Scalars[VF].insert_range(Worklist);
2373}
2374
// NOTE(review): listing lines 2375 (first line of the function header), 2388
// (the statement that ends the Call case) and 2392 (second argument of
// isConsecutivePtr) are missing from this extract. As shown here, the Call
// case would run on into the Load/Store case; confirm against upstream.
2376 ElementCount VF) {
// Instructions that need no predication are never scalar-with-predication.
2377 if (!isPredicatedInst(I))
2378 return false;
2379
2380 // Do we have a non-scalar lowering for this predicated
2381 // instruction? No - it is scalar with predication.
2382 switch(I->getOpcode()) {
2383 default:
2384 return true;
2385 case Instruction::Call:
2386 if (VF.isScalar())
2387 return true;
2389 case Instruction::Load:
2390 case Instruction::Store: {
// A memory access avoids scalarization if it is consecutive with a legal
// masked load/store, or if a gather/scatter is legal for it.
2391 bool IsConsecutive = Legal->isConsecutivePtr(getLoadStoreType(I),
2393 return !(IsConsecutive && Config.isLegalMaskedLoadOrStore(I, VF)) &&
2394 !Config.isLegalGatherOrScatter(I, VF);
2395 }
2396 case Instruction::UDiv:
2397 case Instruction::SDiv:
2398 case Instruction::SRem:
2399 case Instruction::URem: {
2400 // We have the option to use the llvm.masked.udiv intrinsics to avoid
2401 // predication. The cost based decision here will always select the masked
2402 // intrinsics for scalable vectors as scalarization isn't legal.
2403 const auto [ScalarCost, MaskedCost] = getDivRemSpeculationCost(I, VF);
2404 return isDivRemScalarWithPredication(ScalarCost, MaskedCost);
2405 }
2406 }
2407}
2408
// NOTE(review): listing line 2409 (the function header) is missing from this
// extract. The body forwards to LoopVectorizationLegality::isMaskRequired,
// passing whether the tail is folded by masking.
2410 return Legal->isMaskRequired(I, foldTailByMasking());
2411}
2412
2413// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
// NOTE(review): listing lines 2414 (the function header), 2417-2419 (the
// condition guarding the first `return false`), 2438 and 2443 (the opening of
// the two multi-line statements whose message strings appear below) are
// missing from this extract.
2415 // TODO: We can use the loop-preheader as context point here and get
2416 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2420 return false;
2421
2422 // If the instruction was executed conditionally in the original scalar loop,
2423 // predication is needed with a mask whose lanes are all possibly inactive.
2424 if (Legal->blockNeedsPredication(I->getParent()))
2425 return true;
2426
2427 // If we're not folding the tail by masking, predication is unnecessary.
2428 if (!foldTailByMasking())
2429 return false;
2430
2431 // All that remain are instructions with side-effects originally executed in
2432 // the loop unconditionally, but now execute under a tail-fold mask (only)
2433 // having at least one active lane (the first). If the side-effects of the
2434 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2435 // - it will cause the same side-effects as when masked.
2436 switch(I->getOpcode()) {
2437 default:
2439 "instruction should have been considered by earlier checks");
2440 case Instruction::Call:
2441 // Side-effects of a Call are assumed to be non-invariant, needing a
2442 // (fold-tail) mask.
2444 "should have returned earlier for calls not needing a mask");
2445 return true;
2446 case Instruction::Load:
2447 // If the address is loop invariant no predication is needed.
2448 return !Legal->isInvariant(getLoadStorePointerOperand(I));
2449 case Instruction::Store: {
2450 // For stores, we need to prove both speculation safety (which follows from
2451 // the same argument as loads), but also must prove the value being stored
2452 // is correct. The easiest form of the latter is to require that all values
2453 // stored are the same.
2454 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
2455 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
2456 }
2457 case Instruction::UDiv:
2458 case Instruction::URem:
2459 // If the divisor is loop-invariant no predication is needed.
2460 return !Legal->isInvariant(I->getOperand(1));
2461 case Instruction::SDiv:
2462 case Instruction::SRem:
2463 // Conservative for now, since masked-off lanes may be poison and could
2464 // trigger signed overflow.
2465 return true;
2466 }
2467}
2468
// NOTE(review): listing lines 2469-2471 (the function header and the
// condition guarding the first `return 1`) are missing from this extract.
2472 return 1;
2473 // If the block wasn't originally predicated then return early to avoid
2474 // computing BlockFrequencyInfo unnecessarily.
2475 if (!Legal->blockNeedsPredication(BB))
2476 return 1;
2477
2478 uint64_t HeaderFreq =
2479 getBFI().getBlockFreq(TheLoop->getHeader()).getFrequency();
2480 uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
2481 assert(HeaderFreq >= BBFreq &&
2482 "Header has smaller block freq than dominated BB?");
// The divisor is the header-to-block frequency ratio, rounded to the nearest
// integer.
2483 return std::round((double)HeaderFreq / BBFreq);
2484}
2485
// NOTE(review): listing line 2486 (the function header) is missing from this
// extract. The body maps a UDiv/SDiv/URem/SRem opcode to the matching masked
// division/remainder intrinsic; any other opcode is unreachable.
2487 switch (Opcode) {
2488 case Instruction::UDiv:
2489 return Intrinsic::masked_udiv;
2490 case Instruction::SDiv:
2491 return Intrinsic::masked_sdiv;
2492 case Instruction::URem:
2493 return Intrinsic::masked_urem;
2494 case Instruction::SRem:
2495 return Intrinsic::masked_srem;
2496 default:
2497 llvm_unreachable("Unexpected opcode");
2498 }
2499}
2500
// Returns the pair {scalarization cost, masked-intrinsic cost} for the
// division/remainder instruction I at vector factor VF. The scalarization
// cost is invalid for scalable VFs.
// NOTE(review): listing lines 2502 (second line of the function header) and
// 2508 are missing from this extract.
2501std::pair<InstructionCost, InstructionCost>
2503 ElementCount VF) {
2504 assert(I->getOpcode() == Instruction::UDiv ||
2505 I->getOpcode() == Instruction::SDiv ||
2506 I->getOpcode() == Instruction::SRem ||
2507 I->getOpcode() == Instruction::URem);
2509
2510 // Scalarization isn't legal for scalable vector types
2511 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2512 if (!VF.isScalable()) {
2513 // Get the scalarization cost and scale this amount by the probability of
2514 // executing the predicated block; the scaling itself happens at the end of
2515 // this branch.
2516 ScalarizationCost = 0;
2517
2518 // These instructions have a non-void type, so account for the phi nodes
2519 // that we will create. This cost is likely to be zero. The phi node
2520 // cost, if any, should be scaled by the block probability because it
2521 // models a copy at the end of each predicated block.
2522 ScalarizationCost += VF.getFixedValue() *
2523 TTI.getCFInstrCost(Instruction::PHI, Config.CostKind);
2524
2525 // The cost of the non-predicated instruction.
2526 ScalarizationCost +=
2527 VF.getFixedValue() * TTI.getArithmeticInstrCost(
2528 I->getOpcode(), I->getType(), Config.CostKind);
2529
2530 // The cost of insertelement and extractelement instructions needed for
2531 // scalarization.
2532 ScalarizationCost += getScalarizationOverhead(I, VF);
2533
2534 // Scale the cost by the probability of executing the predicated blocks.
2535 // This assumes the predicated block for each vector lane is equally
2536 // likely.
2537 ScalarizationCost =
2538 ScalarizationCost /
2539 getPredBlockCostDivisor(Config.CostKind, I->getParent());
2540 }
2541
// Cost of the masked intrinsic on two vector operands plus an i1 mask vector.
2542 auto *VecTy = toVectorTy(I->getType(), VF);
2543 auto *MaskTy = toVectorTy(Type::getInt1Ty(I->getContext()), VF);
2544 IntrinsicCostAttributes ICA(getMaskedDivRemIntrinsic(I->getOpcode()), VecTy,
2545 {VecTy, VecTy, MaskTy});
2546 InstructionCost MaskedCost = TTI.getIntrinsicInstrCost(ICA, Config.CostKind);
2547 return {ScalarizationCost, MaskedCost};
2548}
2549
// NOTE(review): listing lines 2550 (first line of the function header), 2553
// (opening of the second assert), 2593 and 2596 (right-hand sides of two of
// the masking flags) and 2607 (opening of the assert on masked interleaving)
// are missing from this extract.
2551 Instruction *I, ElementCount VF) const {
2552 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
2554 "Decision should not be set yet.");
2555 auto *Group = getInterleavedAccessGroup(I);
2556 assert(Group && "Must have a group.");
2557 unsigned InterleaveFactor = Group->getFactor();
2558
2559 // If the instruction's allocated size doesn't equal its type size, it
2560 // requires padding and will be scalarized.
2561 auto &DL = I->getDataLayout();
2562 auto *ScalarTy = getLoadStoreType(I);
2563 if (hasIrregularType(ScalarTy, DL))
2564 return false;
2565
2566 // For scalable vectors, the interleave factors must be <= 8 since we require
2567 // the (de)interleaveN intrinsics instead of shufflevectors.
2568 if (VF.isScalable() && InterleaveFactor > 8)
2569 return false;
2570
2571 // If the group involves a non-integral pointer, we may not be able to
2572 // losslessly cast all values to a common type.
2573 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
2574 for (Instruction *Member : Group->members()) {
2575 auto *MemberTy = getLoadStoreType(Member);
2576 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
2577 // Don't coerce non-integral pointers to integers or vice versa.
2578 if (MemberNI != ScalarNI)
2579 // TODO: Consider adding special nullptr value case here
2580 return false;
// Two non-integral pointers must also agree on their address space.
2581 if (MemberNI && ScalarNI &&
2582 ScalarTy->getPointerAddressSpace() !=
2583 MemberTy->getPointerAddressSpace())
2584 return false;
2585 }
2586
2587 // Check if masking is required.
2588 // A Group may need masking for one of two reasons: it resides in a block that
2589 // needs predication, or it was decided to use masking to deal with gaps
2590 // (either a gap at the end of a load-access that may result in a speculative
2591 // load, or any gaps in a store-access).
2592 bool PredicatedAccessRequiresMasking =
2594 bool LoadAccessWithGapsRequiresEpilogMasking =
2595 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
2597 bool StoreAccessWithGapsRequiresMasking =
2598 isa<StoreInst>(I) && !Group->isFull();
// No masking needed at all: the group can be widened as is.
2599 if (!PredicatedAccessRequiresMasking &&
2600 !LoadAccessWithGapsRequiresEpilogMasking &&
2601 !StoreAccessWithGapsRequiresMasking)
2602 return true;
2603
2604 // If masked interleaving is required, we expect that the user/target had
2605 // enabled it, because otherwise it either wouldn't have been created or
2606 // it should have been invalidated by the CostModel.
2608 "Masked interleave-groups for predicated accesses are not enabled.");
2609
// Reversed groups that need masking are not widened.
2610 if (Group->isReverse())
2611 return false;
2612
2613 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
2614 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
2615 StoreAccessWithGapsRequiresMasking;
2616 if (VF.isScalable() && NeedsMaskForGaps)
2617 return false;
2618
2619 return Config.isLegalMaskedLoadOrStore(I, VF);
2620}
2621
// NOTE(review): listing line 2622 (first line of the function header) is
// missing from this extract. The body returns true only if I is a load or
// store through a consecutive pointer, is not scalar-with-predication at VF,
// and has a regular element type.
2623 Instruction *I, ElementCount VF) {
2624 // Get and ensure we have a valid memory instruction.
2625 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
2626
2627 auto *Ptr = getLoadStorePointerOperand(I);
2628 auto *ScalarTy = getLoadStoreType(I);
2629
2630 // In order to be widened, the pointer should be consecutive, first of all.
2631 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
2632 return false;
2633
2634 // If the instruction would be scalarized with predication at this VF, it
2635 // cannot be widened.
2636 if (isScalarWithPredication(I, VF))
2637 return false;
2638
2639 // If the instruction's allocated size doesn't equal its type size, it
2640 // requires padding and will be scalarized.
2641 auto &DL = I->getDataLayout();
2642 if (hasIrregularType(ScalarTy, DL))
2643 return false;
2644
2645 return true;
2646}
2647
/// Collect the instructions of TheLoop that remain uniform after vectorization
/// with \p VF (i.e. instructions for which only lane 0 is demanded) and record
/// them in Uniforms[VF].
/// NOTE(review): this listing appears to have dropped a few source lines (e.g.
/// the declarations of `I` in IsOutOfScope and of `Exiting`) - confirm against
/// the upstream file.
2648void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
2649 // We should not collect Uniforms more than once per VF. Right now,
2650 // this function is called from collectUniformsAndScalars(), which
2651 // already does this check. Collecting Uniforms for VF=1 does not make any
2652 // sense.
2653
2654 assert(VF.isVector() && !Uniforms.contains(VF) &&
2655 "This function should not be visited twice for the same VF");
2656
2657 // Create the entry for VF up front. Even if we find no uniform value, the
2658 // entry exists, so Uniforms.count(VF) returns 1 and we won't analyze again.
2659 Uniforms[VF].clear();
2660
2661 // Now we know that the loop is vectorizable!
2662 // Collect instructions inside the loop that will remain uniform after
2663 // vectorization.
2664
2665 // Global values, params and instructions outside of current loop are out of
2666 // scope.
2667 auto IsOutOfScope = [&](Value *V) -> bool {
// NOTE(review): `I` is not declared in this listing; presumably it is V viewed
// as an Instruction (null for non-instructions) - confirm.
2669 return (!I || !TheLoop->contains(I));
2670 };
2671
2672 // Worklist containing uniform instructions demanding lane 0.
2673 SetVector<Instruction *> Worklist;
2674
2675 // Add uniform instructions demanding lane 0 to the worklist. Instructions
2676 // that require predication must not be considered uniform after
2677 // vectorization, because that would create an erroneous replicating region
2678 // where only a single instance out of VF should be formed.
2679 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
2680 if (IsOutOfScope(I)) {
2681 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
2682 << *I << "\n");
2683 return;
2684 }
2685 if (isPredicatedInst(I)) {
2686 LLVM_DEBUG(
2687 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
2688 << "\n");
2689 return;
2690 }
2691 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
2692 Worklist.insert(I);
2693 };
2694
2695 // Start with the conditional branches exiting the loop. If the branch
2696 // condition is an instruction contained in the loop that is only used by the
2697 // branch, it is uniform. Note conditions from uncountable early exits are not
2698 // uniform.
// NOTE(review): the declaration of `Exiting` is not visible in this listing.
2700 TheLoop->getExitingBlocks(Exiting);
2701 for (BasicBlock *E : Exiting) {
// With an uncountable early exit, only the latch's exit condition is
// considered.
2702 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
2703 continue;
2704 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
2705 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
2706 AddToWorklistIfAllowed(Cmp);
2707 }
2708
// Half of the current VF; used below to consult the result already recorded
// for that smaller VF, if any.
2709 auto PrevVF = VF.divideCoefficientBy(2);
2710 // Return true if all lanes perform the same memory operation, and we can
2711 // thus choose to execute only one.
2712 auto IsUniformMemOpUse = [&](Instruction *I) {
2713 // If the value was already known to not be uniform for the previous
2714 // (smaller VF), it cannot be uniform for the larger VF.
2715 if (PrevVF.isVector()) {
2716 auto Iter = Uniforms.find(PrevVF);
2717 if (Iter != Uniforms.end() && !Iter->second.contains(I))
2718 return false;
2719 }
2720 if (!Legal->isUniformMemOp(*I, VF))
2721 return false;
2722 if (isa<LoadInst>(I))
2723 // Loading the same address always produces the same result - at least
2724 // assuming aliasing and ordering which have already been checked.
2725 return true;
2726 // Storing the same value on every iteration.
2727 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
2728 };
2729
// Returns true if the memory access I is a uniform memory operation or has
// been given a widen, reverse-widen or interleave decision for VF.
2730 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
2731 InstWidening WideningDecision = getWideningDecision(I, VF);
2732 assert(WideningDecision != CM_Unknown &&
2733 "Widening decision should be ready at this moment");
2734
2735 if (IsUniformMemOpUse(I))
2736 return true;
2737
2738 return (WideningDecision == CM_Widen ||
2739 WideningDecision == CM_Widen_Reverse ||
2740 WideningDecision == CM_Interleave);
2741 };
2742
2743 // Returns true if Ptr is the pointer operand of a memory access instruction
2744 // I, I is known to not require scalarization, and the pointer is not also
2745 // stored.
2746 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
2747 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
2748 return false;
2749 return getLoadStorePointerOperand(I) == Ptr &&
2750 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
2751 };
2752
2753 // Holds a list of values which are known to have at least one uniform use.
2754 // Note that there may be other uses which aren't uniform. A "uniform use"
2755 // here is something which only demands lane 0 of the unrolled iterations;
2756 // it does not imply that all lanes produce the same value (i.e. this is not
2757 // the usual meaning of uniform).
2758 SetVector<Value *> HasUniformUse;
2759
2760 // Scan the loop for instructions which are either a) known to have only
2761 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
2762 for (auto *BB : TheLoop->blocks())
2763 for (auto &I : *BB) {
2764 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
// These intrinsics are added to the worklist only when all of their operands
// are loop-invariant.
2765 switch (II->getIntrinsicID()) {
2766 case Intrinsic::sideeffect:
2767 case Intrinsic::experimental_noalias_scope_decl:
2768 case Intrinsic::assume:
2769 case Intrinsic::lifetime_start:
2770 case Intrinsic::lifetime_end:
2771 if (TheLoop->hasLoopInvariantOperands(&I))
2772 AddToWorklistIfAllowed(&I);
2773 break;
2774 default:
2775 break;
2776 }
2777 }
2778
2779 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
2780 if (IsOutOfScope(EVI->getAggregateOperand())) {
2781 AddToWorklistIfAllowed(EVI);
2782 continue;
2783 }
2784 // Only ExtractValue instructions where the aggregate value comes from a
2785 // call are allowed to be non-uniform.
2786 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
2787 "Expected aggregate value to be call return value");
2788 }
2789
2790 // If there's no pointer operand, there's nothing to do.
2791 auto *Ptr = getLoadStorePointerOperand(&I);
2792 if (!Ptr)
2793 continue;
2794
2795 // If the pointer can be proven to be uniform, always add it to the
2796 // worklist.
2797 if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
2798 AddToWorklistIfAllowed(cast<Instruction>(Ptr));
2799
2800 if (IsUniformMemOpUse(&I))
2801 AddToWorklistIfAllowed(&I);
2802
2803 if (IsVectorizedMemAccessUse(&I, Ptr))
2804 HasUniformUse.insert(Ptr);
2805 }
2806
2807 // Add to the worklist any operands which have *only* uniform (i.e. lane 0
2808 // demanding) users. Since loops are assumed to be in LCSSA form, this
2809 // disallows uses outside the loop as well.
2810 for (auto *V : HasUniformUse) {
2811 if (IsOutOfScope(V))
2812 continue;
2813 auto *I = cast<Instruction>(V);
2814 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
2815 auto *UI = cast<Instruction>(U);
2816 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
2817 });
2818 if (UsersAreMemAccesses)
2819 AddToWorklistIfAllowed(I);
2820 }
2821
2822 // Expand Worklist in topological order: whenever a new instruction
2823 // is added, its users should already be inside Worklist. This ensures
2824 // a uniform instruction will only be used by uniform instructions.
// Worklist may grow while it is being traversed, hence the index-based loop.
2825 unsigned Idx = 0;
2826 while (Idx != Worklist.size()) {
2827 Instruction *I = Worklist[Idx++];
2828
2829 for (auto *OV : I->operand_values()) {
2830 // Out-of-scope operands cannot be uniform instructions.
2831 if (IsOutOfScope(OV))
2832 continue;
2833 // First order recurrence Phi's should typically be considered
2834 // non-uniform.
2835 auto *OP = dyn_cast<PHINode>(OV);
2836 if (OP && Legal->isFixedOrderRecurrence(OP))
2837 continue;
2838 // If all the users of the operand are uniform, then add the
2839 // operand into the uniform worklist.
2840 auto *OI = cast<Instruction>(OV);
2841 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
2842 auto *J = cast<Instruction>(U);
2843 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
2844 }))
2845 AddToWorklistIfAllowed(OI);
2846 }
2847 }
2848
2849 // For an instruction to be added into Worklist above, all its users inside
2850 // the loop should also be in Worklist. However, this condition cannot be
2851 // true for phi nodes that form a cyclic dependence. We must process phi
2852 // nodes separately. An induction variable will remain uniform if all users
2853 // of the induction variable and induction variable update remain uniform.
2854 // The code below handles both pointer and non-pointer induction variables.
2855 BasicBlock *Latch = TheLoop->getLoopLatch();
2856 for (const auto &Induction : Legal->getInductionVars()) {
2857 auto *Ind = Induction.first;
2858 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2859
2860 // Determine if all users of the induction variable are uniform after
2861 // vectorization.
2862 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
2863 auto *I = cast<Instruction>(U);
2864 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2865 IsVectorizedMemAccessUse(I, Ind);
2866 });
2867 if (!UniformInd)
2868 continue;
2869
2870 // Determine if all users of the induction variable update instruction are
2871 // uniform after vectorization.
2872 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2873 auto *I = cast<Instruction>(U);
2874 return I == Ind || Worklist.count(I) ||
2875 IsVectorizedMemAccessUse(I, IndUpdate);
2876 });
2877 if (!UniformIndUpdate)
2878 continue;
2879
2880 // The induction variable and its update instruction will remain uniform.
2881 AddToWorklistIfAllowed(Ind);
2882 AddToWorklistIfAllowed(IndUpdate);
2883 }
2884
// Publish everything collected in Worklist as the uniforms for this VF.
2885 Uniforms[VF].insert_range(Worklist);
2886}
2887
/// Compute the maximum feasible fixed and scalable vectorization factors and
/// return them as a FixedScalableVFPair. Along the way this decides whether
/// the loop tail is folded by masking or handled by an epilogue.
/// NOTE(review): this listing has dropped several source lines (the rest of the
/// signature, the definition of `TC`, some `case` labels and several `return`
/// statements after the failure reports), so the control flow cannot be fully
/// read from here - confirm against the upstream file.
2888FixedScalableVFPair
2890 // For outer loops, use simple type-based heuristic VF. No cost model or
2891 // memory dependence analysis is available.
2892 if (!TheLoop->isInnermost()) {
2893 return Config.computeVPlanOuterloopVF(UserVF);
2894 }
2895
2896 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
2897 // TODO: It may be useful to do since it's still likely to be dynamically
2898 // uniform if the target can skip.
2900 "Not inserting runtime ptr check for divergent target",
2901 "runtime pointer checks needed. Not enabled for divergent target",
2902 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
2904 }
2905
2906 ScalarEvolution *SE = PSE.getSE();
2908 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
2909 if (!MaxTC && EpilogueLoweringStatus == CM_EpilogueAllowed)
2911 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
2912 if (TC != ElementCount::getFixed(MaxTC))
2913 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
2914 if (TC.isScalar()) {
2915 reportVectorizationFailure("Single iteration (non) loop",
2916 "loop trip count is one, irrelevant for vectorization",
2917 "SingleIterationLoop", ORE, TheLoop);
2919 }
2920
2921 // If BTC matches the widest induction type and is -1 then the trip count
2922 // computation will wrap to 0 and the vector trip count will be 0. Do not try
2923 // to vectorize.
2924 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
2925 if (!isa<SCEVCouldNotCompute>(BTC) &&
2926 BTC->getType()->getScalarSizeInBits() >=
2927 Legal->getWidestInductionType()->getScalarSizeInBits() &&
2929 SE->getMinusOne(BTC->getType()))) {
2931 "Trip count computation wrapped",
2932 "backedge-taken count is -1, loop trip count wrapped to 0",
2933 "TripCountWrapped", ORE, TheLoop);
2935 }
2936
2937 assert(WideningDecisions.empty() && CallWideningDecisions.empty() &&
2938 Uniforms.empty() && Scalars.empty() &&
2939 "No cost-modeling decisions should have been taken at this point");
2940
// Dispatch on how the loop tail may be handled.
// NOTE(review): some `case` labels of this switch are missing from the
// listing.
2941 switch (EpilogueLoweringStatus) {
2942 case CM_EpilogueAllowed:
2943 return Config.computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false,
2946 [[fallthrough]];
2948 LLVM_DEBUG(dbgs() << "LV: tail-folding hint/switch found.\n"
2949 << "LV: Not allowing epilogue, creating tail-folded "
2950 << "vector loop.\n");
2951 break;
2953 // fallthrough as a special case of OptForSize
2955 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedOptSize)
2956 LLVM_DEBUG(dbgs() << "LV: Not allowing epilogue due to -Os/-Oz.\n");
2957 else
2958 LLVM_DEBUG(dbgs() << "LV: Not allowing epilogue due to low trip "
2959 << "count.\n");
2960
2961 // Bail if runtime checks are required, which are not good when optimising
2962 // for size.
2963 if (Config.runtimeChecksRequired())
2965
2966 break;
2967 }
2968
2969 // Now try the tail folding
2970
2971 // Invalidate interleave groups that require an epilogue if we can't mask
2972 // the interleave-group.
2974 // Note: There is no need to invalidate any cost modeling decisions here, as
2975 // none were taken so far (see assertion above).
2976 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
2977 }
2978
2979 FixedScalableVFPair MaxFactors = Config.computeFeasibleMaxVF(
2980 MaxTC, UserVF, UserIC, true, requiresScalarEpilogue(true));
2981
2982 // Avoid tail folding if the trip count is known to be a multiple of any VF
2983 // we choose.
// Start from the fixed VF; if a scalable VF exists, take the larger of the
// fixed VF and MaxVScale * the scalable VF's known minimum, or give up
// (nullopt) when the maximum vscale is unknown.
2984 std::optional<unsigned> MaxPowerOf2RuntimeVF =
2985 MaxFactors.FixedVF.getFixedValue();
2986 if (MaxFactors.ScalableVF) {
2987 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
2988 if (MaxVScale) {
2989 MaxPowerOf2RuntimeVF = std::max<unsigned>(
2990 *MaxPowerOf2RuntimeVF,
2991 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
2992 } else
2993 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
2994 }
2995
// Returns true if (backedge-taken count + 1) is known to be a multiple of
// MaxVF (times UserIC when UserIC is non-zero), i.e. no iterations remain.
2996 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
2997 // Return false if the loop is neither a single-latch-exit loop nor an
2998 // early-exit loop as tail-folding is not supported in that case.
2999 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3000 !Legal->hasUncountableEarlyExit())
3001 return false;
3002 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3003 ScalarEvolution *SE = PSE.getSE();
3004 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3005 // with uncountable exits. For countable loops, the symbolic maximum must
3006 // remain identical to the known back-edge taken count.
3007 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3008 assert((Legal->hasUncountableEarlyExit() ||
3009 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3010 "Invalid loop count");
3011 const SCEV *ExitCount = SE->getAddExpr(
3012 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3013 const SCEV *Rem = SE->getURemExpr(
3014 SE->applyLoopGuards(ExitCount, TheLoop),
3015 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3016 return Rem->isZero();
3017 };
3018
// An empty std::optional compares less than any value, so this condition is
// false when MaxPowerOf2RuntimeVF was reset to std::nullopt above.
3019 if (MaxPowerOf2RuntimeVF > 0u) {
3020 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3021 "MaxFixedVF must be a power of 2");
3022 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3023 // Accept MaxFixedVF if we do not have a tail.
3024 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3025 return MaxFactors;
3026 }
3027 }
3028
3029 auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3030 if (ExpectedTC && ExpectedTC->isFixed() &&
3031 ExpectedTC->getFixedValue() <=
3032 TTI.getMinTripCountTailFoldingThreshold()) {
3033 if (MaxPowerOf2RuntimeVF > 0u) {
3034 // If we have a low-trip-count, and the fixed-width VF is known to divide
3035 // the trip count but the scalable factor does not, use the fixed-width
3036 // factor in preference to allow the generation of a non-predicated loop.
3037 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedLowTripLoop &&
3038 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3039 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3040 "remain for any chosen VF.\n");
3041 MaxFactors.ScalableVF = ElementCount::getScalable(0);
3042 return MaxFactors;
3043 }
3044 }
3045
3047 "The trip count is below the minial threshold value.",
3048 "loop trip count is too low, avoiding vectorization", "LowTripCount",
3049 ORE, TheLoop);
3051 }
3052
3053 // If we don't know the precise trip count, or if the trip count that we
3054 // found modulo the vectorization factor is not zero, try to fold the tail
3055 // by masking.
3056 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3057 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3058 setTailFoldingStyle(ContainsScalableVF, UserIC);
3059 if (foldTailByMasking()) {
3060 if (foldTailWithEVL()) {
3061 LLVM_DEBUG(
3062 dbgs()
3063 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3064 "try to generate VP Intrinsics with scalable vector "
3065 "factors only.\n");
3066 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3067 // for now.
3068 // TODO: extend it for fixed vectors, if required.
3069 assert(ContainsScalableVF && "Expected scalable vector factor.");
3070
3071 MaxFactors.FixedVF = ElementCount::getFixed(1);
3072 }
3073 return MaxFactors;
3074 }
3075
3076 // If there was a tail-folding hint/switch, but we can't fold the tail by
3077 // masking, fallback to a vectorization with an epilogue.
3078 if (EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail) {
3079 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with an "
3080 "epilogue instead.\n");
3081 EpilogueLoweringStatus = CM_EpilogueAllowed;
3082 return MaxFactors;
3083 }
3084
3085 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedFoldTail) {
3086 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3088 }
3089
3090 if (TC.isZero()) {
3092 "unable to calculate the loop count due to complex control flow",
3093 "UnknownLoopCountComplexCFG", ORE, TheLoop);
3095 }
3096
3098 "Cannot optimize for size and vectorize at the same time.",
3099 "cannot optimize for size and vectorize at the same time. "
3100 "Enable vectorization of this loop with '#pragma clang loop "
3101 "vectorize(enable)' when compiling with -Os/-Oz",
3102 "NoTailLoopWithOptForSize", ORE, TheLoop);
3104}
3105
// Collects every (recipe, VF) pair whose VPlan-based cost is invalid and emits
// one "InvalidCost" remark per recipe, listing all affected VFs.
// NOTE(review): the function signature, the loop over basic blocks, the
// declaration of `Numbering` and the head of the type switch are missing from
// this listing - confirm against the upstream file.
3108 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3109 SmallVector<RecipeVFPair> InvalidCosts;
3110 for (const auto &Plan : VPlans) {
3111 for (ElementCount VF : Plan->vectorFactors()) {
3112 // The VPlan-based cost model is designed for computing vector cost.
3113 // Querying the VPlan-based cost model with a scalar VF will cause some
3114 // errors because we expect the VF to be a vector for most of the widen
3115 // recipes.
3116 if (VF.isScalar())
3117 continue;
3118
3119 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
3120 OrigLoop);
3121 precomputeCosts(*Plan, VF, CostCtx);
3122 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
3124 for (auto &R : *VPBB) {
3125 if (!R.cost(VF, CostCtx).isValid())
3126 InvalidCosts.emplace_back(&R, VF);
3127 }
3128 }
3129 }
3130 }
3131 if (InvalidCosts.empty())
3132 return;
3133
3134 // Emit a report of VFs with invalid costs in the loop.
3135
3136 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
// Each distinct recipe gets the next number the first time it is seen.
3138 unsigned I = 0;
3139 for (auto &Pair : InvalidCosts)
3140 if (Numbering.try_emplace(Pair.first, I).second)
3141 ++I;
3142
3143 // Sort the list, first on recipe(number) then on VF.
3144 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
3145 unsigned NA = Numbering[A.first];
3146 unsigned NB = Numbering[B.first];
3147 if (NA != NB)
3148 return NA < NB;
3149 return ElementCount::isKnownLT(A.second, B.second);
3150 });
3151
3152 // For a list of ordered recipe-VF pairs:
3153 // [(load, VF1), (load, VF2), (store, VF1)]
3154 // group the recipes together to emit separate remarks for:
3155 // load (VF1, VF2)
3156 // store (VF1)
// Tail is the part of the list not yet reported; Subset is the growing prefix
// of Tail that refers to the same recipe.
3157 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
3158 auto Subset = ArrayRef<RecipeVFPair>();
3159 do {
3160 if (Subset.empty())
3161 Subset = Tail.take_front(1);
3162
3163 VPRecipeBase *R = Subset.front().first;
3164
// Map the recipe kind to the IR opcode used to name it in the remark.
3165 unsigned Opcode =
3167 .Case([](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
3168 .Case(
3169 [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
3170 .Case([](const VPWidenLoadRecipe *R) { return Instruction::Load; })
3171 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
3172 [](const auto *R) { return Instruction::Call; })
3175 [](const auto *R) { return R->getOpcode(); })
3176 .Case([](const VPInterleaveRecipe *R) {
3177 return R->getStoredValues().empty() ? Instruction::Load
3178 : Instruction::Store;
3179 })
3180 .Case([](const VPReductionRecipe *R) {
3181 return RecurrenceDescriptor::getOpcode(R->getRecurrenceKind());
3182 });
3183
3184 // If the next recipe is different, or if there are no other pairs,
3185 // emit a remark for the collated subset. e.g.
3186 // [(load, VF1), (load, VF2)]
3187 // to emit:
3188 // remark: invalid costs for 'load' at VF=(VF1, VF2)
3189 if (Subset == Tail || Tail[Subset.size()].first != R) {
3190 std::string OutString;
3191 raw_string_ostream OS(OutString);
3192 assert(!Subset.empty() && "Unexpected empty range");
3193 OS << "Recipe with invalid costs prevented vectorization at VF=(";
3194 for (const auto &Pair : Subset)
3195 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
3196 OS << "):";
3197 if (Opcode == Instruction::Call) {
// For calls, name the intrinsic or the called function instead of the opcode.
3198 StringRef Name = "";
3199 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
3200 Name = Int->getIntrinsicName();
3201 } else {
// Recipes that are not VPWidenCallRecipe take the callee from the last
// operand's live-in IR value.
3202 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
3203 Function *CalledFn =
3204 WidenCall ? WidenCall->getCalledScalarFunction()
3205 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
3206 ->getLiveInIRValue());
3207 Name = CalledFn->getName();
3208 }
3209 OS << " call to " << Name;
3210 } else
3211 OS << " " << Instruction::getOpcodeName(Opcode);
3212 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
3213 R->getDebugLoc());
3214 Tail = Tail.drop_front(Subset.size());
3215 Subset = {};
3216 } else
3217 // Grow the subset by one element
3218 Subset = Tail.take_front(Subset.size() + 1);
3219 } while (!Tail.empty());
3220}
3221
3222/// Check if any recipe of \p Plan will generate a vector value, which will be
3223/// assigned a vector register.
/// Returns true as soon as one such recipe is found for \p VF, false otherwise.
/// NOTE(review): the first line of the signature, the loop over the plan's
/// basic blocks and part of one condition are missing from this listing -
/// confirm against the upstream file.
3225 const TargetTransformInfo &TTI) {
3226 assert(VF.isVector() && "Checking a scalar VF?");
3227 VPTypeAnalysis TypeInfo(Plan);
3228 DenseSet<VPRecipeBase *> EphemeralRecipes;
3229 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
3230 // Set of already visited types.
3231 DenseSet<Type *> Visited;
3234 for (VPRecipeBase &R : *VPBB) {
3235 if (EphemeralRecipes.contains(&R))
3236 continue;
3237 // Continue early if the recipe is considered to not produce a vector
3238 // result. Note that this includes VPInstruction where some opcodes may
3239 // produce a vector, to preserve existing behavior as VPInstructions model
3240 // aspects not directly mapped to existing IR instructions.
3241 switch (R.getVPRecipeID()) {
3242 case VPRecipeBase::VPDerivedIVSC:
3243 case VPRecipeBase::VPScalarIVStepsSC:
3244 case VPRecipeBase::VPReplicateSC:
3245 case VPRecipeBase::VPInstructionSC:
3246 case VPRecipeBase::VPCurrentIterationPHISC:
3247 case VPRecipeBase::VPVectorPointerSC:
3248 case VPRecipeBase::VPVectorEndPointerSC:
3249 case VPRecipeBase::VPExpandSCEVSC:
3250 case VPRecipeBase::VPPredInstPHISC:
3251 case VPRecipeBase::VPBranchOnMaskSC:
3252 continue;
3253 case VPRecipeBase::VPReductionSC:
3254 case VPRecipeBase::VPActiveLaneMaskPHISC:
3255 case VPRecipeBase::VPWidenCallSC:
3256 case VPRecipeBase::VPWidenCanonicalIVSC:
3257 case VPRecipeBase::VPWidenCastSC:
3258 case VPRecipeBase::VPWidenGEPSC:
3259 case VPRecipeBase::VPWidenIntrinsicSC:
3260 case VPRecipeBase::VPWidenMemIntrinsicSC:
3261 case VPRecipeBase::VPWidenSC:
3262 case VPRecipeBase::VPBlendSC:
3263 case VPRecipeBase::VPFirstOrderRecurrencePHISC:
3264 case VPRecipeBase::VPHistogramSC:
3265 case VPRecipeBase::VPWidenPHISC:
3266 case VPRecipeBase::VPWidenIntOrFpInductionSC:
3267 case VPRecipeBase::VPWidenPointerInductionSC:
3268 case VPRecipeBase::VPReductionPHISC:
3269 case VPRecipeBase::VPInterleaveEVLSC:
3270 case VPRecipeBase::VPInterleaveSC:
3271 case VPRecipeBase::VPWidenLoadEVLSC:
3272 case VPRecipeBase::VPWidenLoadSC:
3273 case VPRecipeBase::VPWidenStoreEVLSC:
3274 case VPRecipeBase::VPWidenStoreSC:
3275 break;
3276 default:
3277 llvm_unreachable("unhandled recipe");
3278 }
3279
// Decides from the number of legal parts reported by TTI whether VectorTy is
// treated as a real target vector for VF.
3280 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
3281 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
3282 if (!NumLegalParts)
3283 return false;
3284 if (VF.isScalable()) {
3285 // <vscale x 1 x iN> is assumed to be profitable over iN because
3286 // scalable registers are a distinct register class from scalar
3287 // ones. If we ever find a target which wants to lower scalable
3288 // vectors back to scalars, we'll need to update this code to
3289 // explicitly ask TTI about the register class uses for each part.
3290 return NumLegalParts <= VF.getKnownMinValue();
3291 }
3292 // Two or more elements that share a register - are vectorized.
3293 return NumLegalParts < VF.getFixedValue();
3294 };
3295
3296 // If the recipe defines no value and is not a store (e.g. a branch),
// continue - there is no value to check.
3297 if (R.getNumDefinedValues() == 0 &&
3299 continue;
3300 // For multi-def recipes, currently only interleaved loads, it suffices to
3301 // check the first def only.
3302 // For stores, check their stored value; for interleaved stores it suffices
3303 // to check the first stored value only. In all cases this is the second
3304 // operand.
3305 VPValue *ToCheck =
3306 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
3307 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
// Each scalar type only needs to be queried once.
3308 if (!Visited.insert({ScalarTy}).second)
3309 continue;
3310 Type *WideTy = toVectorizedTy(ScalarTy, VF);
3311 if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
3312 return true;
3313 }
3314 }
3315
3316 return false;
3317}
3318
/// Tests blocks reached from the entry of \p Plan's vector loop region with
/// the predicate `isReplicator()`.
/// NOTE(review): the line that starts the return expression (the algorithm and
/// traversal being applied) is missing from this listing - confirm against the
/// upstream file.
3319static bool hasReplicatorRegion(VPlan &Plan) {
3321 Plan.getVectorLoopRegion()->getEntry())),
3322 [](auto *VPRB) { return VPRB->isReplicator(); });
3323}
3324
3325/// Returns true if the VPlan contains a VPReductionPHIRecipe with
3326/// FindLast recurrence kind.
/// NOTE(review): the line that starts the return expression (which recipes are
/// scanned) is missing from this listing - confirm against the upstream file.
3327static bool hasFindLastReductionPhi(VPlan &Plan) {
3329 [](VPRecipeBase &R) {
// Only reduction phi recipes can match; any other recipe yields false.
3330 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(&R);
3331 return RedPhi &&
3332 RecurrenceDescriptor::isFindLastRecurrenceKind(
3333 RedPhi->getRecurrenceKind());
3334 });
3335}
3336
3337/// Returns true if the VPlan contains header phi recipes that are not currently
3338/// supported for epilogue vectorization.
/// NOTE(review): the signature line and the range passed to any_of are missing
/// from this listing - confirm against the upstream file.
3340 return any_of(
3342 [](VPRecipeBase &R) {
3343 switch (R.getVPRecipeID()) {
3344 case VPRecipeBase::VPFirstOrderRecurrencePHISC:
3345 // TODO: Add support for fixed-order recurrences.
3346 return true;
3347 case VPRecipeBase::VPWidenIntOrFpInductionSC:
// Unsupported when the widened induction has no PHINode.
3348 return !cast<VPWidenIntOrFpInductionRecipe>(&R)->getPHINode();
3349 case VPRecipeBase::VPReductionPHISC: {
3350 auto *RedPhi = cast<VPReductionPHIRecipe>(&R);
3351 // TODO: Support FMinNum/FMaxNum, FindLast reductions, and reductions
3352 // without underlying values.
3353 RecurKind Kind = RedPhi->getRecurrenceKind();
3354 if (RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind) ||
3355 RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) ||
3356 !RedPhi->getUnderlyingValue())
3357 return true;
3358 // TODO: Add support for FindIV reductions with sunk expressions: the
3359 // resume value from the main loop is in expression domain (e.g.,
3360 // mul(ReducedIV, 3)), but the epilogue tracks raw IV values. A sunk
3361 // expression is identified by a non-VPInstruction user of
3362 // ComputeReductionResult.
3363 if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
3364 auto *RdxResult = vputils::findComputeReductionResult(RedPhi);
3365 assert(RdxResult &&
3366 "FindIV reduction must have ComputeReductionResult");
3367 return any_of(RdxResult->users(),
3368 std::not_fn(IsaPred<VPInstruction>));
3369 }
3370 return false;
3371 }
3372 default:
// All other recipe kinds are treated as supported.
3373 return false;
3374 };
3375 });
3376}
3377
3378bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
3379 VPlan &MainPlan) const {
3380 // Bail out if the plan contains header phi recipes not yet supported
3381 // for epilogue vectorization.
3382 if (hasUnsupportedHeaderPhiRecipe(MainPlan))
3383 return false;
3384
3385 // Epilogue vectorization code has not been auditted to ensure it handles
3386 // non-latch exits properly. It may be fine, but it needs auditted and
3387 // tested.
3388 // TODO: Add support for loops with an early exit.
3389 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
3390 return false;
3391
3392 return true;
3393}
3394
// Returns true if the estimated element count of VF * IC reaches the minimum
// threshold for epilogue vectorization and the target has not opted out.
// NOTE(review): the first line of the signature and one arm of the conditional
// that selects MinVFThreshold are missing from this listing - confirm against
// the upstream file.
3396 const ElementCount VF, const unsigned IC) const {
3397 // FIXME: We need a much better cost-model to take different parameters such
3398 // as register pressure, code size increase and cost of extra branches into
3399 // account. For now we apply a very crude heuristic and only consider loops
3400 // with vectorization factors larger than a certain value.
3401
3402 // Allow the target to opt out.
3403 if (!TTI.preferEpilogueVectorization(VF * IC))
3404 return false;
3405
// When the EpilogueVectorizationMinVF option was not given explicitly, the
// threshold comes from TTI.
3406 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
3408 : TTI.getEpilogueVectorizationMinVF();
3409 return estimateElementCount(VF * IC, Config.getVScaleForTuning()) >=
3410 MinVFThreshold;
3411}
3412
3414 VPlan &MainPlan, ElementCount MainLoopVF, unsigned IC) {
3416 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
3417 return nullptr;
3418 }
3419
3420 if (!CM.isEpilogueAllowed()) {
3421 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
3422 "epilogue is allowed.\n");
3423 return nullptr;
3424 }
3425
3426 // Not really a cost consideration, but check for unsupported cases here to
3427 // simplify the logic.
3428 if (!isCandidateForEpilogueVectorization(MainPlan)) {
3429 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
3430 "is not a supported candidate.\n");
3431 return nullptr;
3432 }
3433
3436 IC * estimateElementCount(MainLoopVF, Config.getVScaleForTuning())) {
3437 // Note that the main loop leaves IC * MainLoopVF iterations iff a scalar
3438 // epilogue is required, but then the epilogue loop also requires a scalar
3439 // epilogue.
3440 LLVM_DEBUG(dbgs() << "LEV: Forced epilogue VF results in dead epilogue "
3441 "vector loop, skipping vectorizing epilogue.\n");
3442 return nullptr;
3443 }
3444
3445 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
3447 if (hasPlanWithVF(ForcedEC)) {
3448 std::unique_ptr<VPlan> Clone(getPlanFor(ForcedEC).duplicate());
3449 Clone->setVF(ForcedEC);
3450 return Clone;
3451 }
3452
3453 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
3454 "viable.\n");
3455 return nullptr;
3456 }
3457
3458 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
3459 LLVM_DEBUG(
3460 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
3461 return nullptr;
3462 }
3463
3464 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
3465 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
3466 "this loop\n");
3467 return nullptr;
3468 }
3469
3470 // Check if a plan's vector loop processes fewer iterations than VF (e.g. when
3471 // interleave groups have been narrowed by narrowInterleaveGroups) and return
3472 // the adjusted, effective VF.
3473 using namespace VPlanPatternMatch;
3474 auto GetEffectiveVF = [](VPlan &Plan, ElementCount VF) -> ElementCount {
3475 auto *Exiting = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3476 if (match(&Exiting->back(),
3477 m_BranchOnCount(m_Add(m_CanonicalIV(), m_Specific(&Plan.getUF())),
3478 m_VPValue())))
3479 return ElementCount::get(1, VF.isScalable());
3480 return VF;
3481 };
3482
3483 // Check if the main loop processes fewer than MainLoopVF elements per
3484 // iteration (e.g. due to narrowing interleave groups). Adjust MainLoopVF
3485 // as needed.
3486 MainLoopVF = GetEffectiveVF(MainPlan, MainLoopVF);
3487
3488 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
3489 // the main loop handles 8 lanes per iteration. We could still benefit from
3490 // vectorizing the epilogue loop with VF=4.
3491 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
3492 estimateElementCount(MainLoopVF, Config.getVScaleForTuning()));
3493
3494 Type *TCType = Legal->getWidestInductionType();
3495 const SCEV *RemainingIterations = nullptr;
3496 unsigned MaxTripCount = 0;
3497 const SCEV *TC = vputils::getSCEVExprForVPValue(MainPlan.getTripCount(), PSE);
3498 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
3499 const SCEV *KnownMinTC;
3500 bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
3501 bool ScalableRemIter = false;
3502 ScalarEvolution &SE = *PSE.getSE();
3503 // Use versions of TC and VF in which both are either scalable or fixed.
3504 if (ScalableTC == MainLoopVF.isScalable()) {
3505 ScalableRemIter = ScalableTC;
3506 RemainingIterations =
3507 SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
3508 } else if (ScalableTC) {
3509 const SCEV *EstimatedTC = SE.getMulExpr(
3510 KnownMinTC,
3511 SE.getConstant(TCType, Config.getVScaleForTuning().value_or(1)));
3512 RemainingIterations = SE.getURemExpr(
3513 EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
3514 } else
3515 RemainingIterations =
3516 SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
3517
3518 // No iterations left to process in the epilogue.
3519 if (RemainingIterations->isZero())
3520 return nullptr;
3521
3522 if (MainLoopVF.isFixed()) {
3523 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
3524 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
3525 SE.getConstant(TCType, MaxTripCount))) {
3526 MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
3527 }
3528 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
3529 << MaxTripCount << "\n");
3530 }
3531
3532 auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
3533 return SE.isKnownPredicate(CmpInst::ICMP_UGT, VF, RemIter);
3534 };
3536 VPlan *BestPlan = nullptr;
3537 for (auto &NextVF : ProfitableVFs) {
3538 // Skip candidate VFs without a corresponding VPlan.
3539 if (!hasPlanWithVF(NextVF.Width))
3540 continue;
3541
3542 VPlan &CurrentPlan = getPlanFor(NextVF.Width);
3543 ElementCount EffectiveVF = GetEffectiveVF(CurrentPlan, NextVF.Width);
3544 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
3545 // vectors) or > the VF of the main loop (fixed vectors).
3546 if ((!EffectiveVF.isScalable() && MainLoopVF.isScalable() &&
3547 ElementCount::isKnownGE(EffectiveVF, EstimatedRuntimeVF)) ||
3548 (EffectiveVF.isScalable() &&
3549 ElementCount::isKnownGE(EffectiveVF, MainLoopVF)) ||
3550 (!EffectiveVF.isScalable() && !MainLoopVF.isScalable() &&
3551 ElementCount::isKnownGT(EffectiveVF, MainLoopVF)))
3552 continue;
3553
3554 // If EffectiveVF is greater than the number of remaining iterations, the
3555 // epilogue loop would be dead. Skip such factors. If the epilogue plan
3556 // also has narrowed interleave groups, use the effective VF since
3557 // the epilogue step will be reduced to its IC.
3558 // TODO: We should also consider comparing against a scalable
3559 // RemainingIterations when SCEV be able to evaluate non-canonical
3560 // vscale-based expressions.
3561 if (!ScalableRemIter) {
3562 // Handle the case where EffectiveVF and RemainingIterations are in
3563 // different numerical spaces.
3564 if (EffectiveVF.isScalable())
3565 EffectiveVF = ElementCount::getFixed(
3566 estimateElementCount(EffectiveVF, Config.getVScaleForTuning()));
3567 if (SkipVF(SE.getElementCount(TCType, EffectiveVF), RemainingIterations))
3568 continue;
3569 }
3570
3571 if (Result.Width.isScalar() ||
3572 isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
3573 /*IsEpilogue*/ true)) {
3574 Result = NextVF;
3575 BestPlan = &CurrentPlan;
3576 }
3577 }
3578
3579 if (!BestPlan)
3580 return nullptr;
3581
3582 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
3583 << Result.Width << "\n");
3584 std::unique_ptr<VPlan> Clone(BestPlan->duplicate());
3585 Clone->setVF(Result.Width);
3586 return Clone;
3587}
3588
// Heuristically chooses the interleave count (IC) used when vectorizing Plan
// with vectorization factor VF, and returns it as an unsigned value.
//
// Visible behaviour, in order:
//  * Returns 1 early when an epilogue is not allowed (unless a tail-folded
//    loop with wide active lane masks is preferred), when the loop is not
//    safe for any vector width, when the plan has an early exit, when it has
//    a FindLast reduction phi, or when the loop body cost is zero.
//  * If LoopCost is 0 on entry, the cost is computed here (CM.expectedCost for
//    scalar VF, cost(Plan, VF, &R) otherwise).
//  * Derives a register-pressure bound per register class, takes the minimum
//    across classes, and clamps it to the target/user maximum interleave
//    factor, which may itself be reduced using the best known trip count.
//  * Vector loops with reductions return that IC directly; loops cheaper than
//    SmallLoopCost choose among SmallIC, a load/store-port-saturating count,
//    and an ILP-oriented count; remaining loops return IC only when the target
//    enables aggressive interleaving, otherwise 1.
//
// NOTE(review): this text comes from a rendered listing whose embedded line
// numbers skip 3590, 3614-3615, 3632-3633, 3696, 3827-3828, 3830, 3834, 3867,
// 3887 and 3905. The declarator (presumably
// LoopVectorizationPlanner::selectInterleaveCount taking Plan and VF) and
// several conditions/initializers are therefore not visible here - confirm
// against the full file before relying on the notes below.
3589unsigned
3591 InstructionCost LoopCost) {
3592 // -- The interleave heuristics --
3593 // We interleave the loop in order to expose ILP and reduce the loop overhead.
3594 // There are many micro-architectural considerations that we can't predict
3595 // at this level. For example, frontend pressure (on decode or fetch) due to
3596 // code size, or the number and capabilities of the execution ports.
3597 //
3598 // We use the following heuristics to select the interleave count:
3599 // 1. If the code has reductions, then we interleave to break the cross
3600 // iteration dependency.
3601 // 2. If the loop is really small, then we interleave to reduce the loop
3602 // overhead.
3603 // 3. We don't interleave if we think that we will spill registers to memory
3604 // due to the increased register pressure.
3605
3606 // Only interleave tail-folded loops if wide lane masks are requested, as the
3607 // overhead of multiple instructions to calculate the predicate is likely
3608 // not beneficial. If an epilogue is not allowed for any other reason,
3609 // do not interleave.
3610 if (!CM.isEpilogueAllowed() &&
3611 !(CM.preferTailFoldedLoop() && CM.useWideActiveLaneMask()))
3612 return 1;
3613
// NOTE(review): the condition guarding the next early return (listing lines
// 3614-3615) is missing from this extract; only its body is visible.
3616 LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
3617 "Unroll factor forced to be 1.\n");
3618 return 1;
3619 }
3620
3621 // We used the distance for the interleave count.
3622 if (!Legal->isSafeForAnyVectorWidth())
3623 return 1;
3624
3625 // We don't attempt to perform interleaving for loops with uncountable early
3626 // exits because the VPInstruction::AnyOf code cannot currently handle
3627 // multiple parts.
3628 if (Plan.hasEarlyExit())
3629 return 1;
3630
// NOTE(review): the initializer of HasReductions (listing lines 3632-3633) is
// missing from this extract.
3631 const bool HasReductions =
3634
3635 // FIXME: implement interleaving for FindLast transform correctly.
3636 if (hasFindLastReductionPhi(Plan))
3637 return 1;
3638
3639 VPRegisterUsage R =
3640 calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
3641
3642 // If we did not calculate the cost for VF (because the user selected the VF)
3643 // then we calculate the cost of VF here.
3644 if (LoopCost == 0) {
3645 if (VF.isScalar())
3646 LoopCost = CM.expectedCost(VF);
3647 else
3648 LoopCost = cost(Plan, VF, &R);
3649 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
3650
3651 // Loop body is free and there is no need for interleaving.
3652 if (LoopCost == 0)
3653 return 1;
3654 }
3655
3656 // We divide by these constants so assume that we have at least one
3657 // instruction that uses at least one register.
3658 for (auto &Pair : R.MaxLocalUsers) {
3659 Pair.second = std::max(Pair.second, 1U);
3660 }
3661
3662 // We calculate the interleave count using the following formula.
3663 // Subtract the number of loop invariants from the number of available
3664 // registers. These registers are used by all of the interleaved instances.
3665 // Next, divide the remaining registers by the number of registers that is
3666 // required by the loop, in order to estimate how many parallel instances
3667 // fit without causing spills. All of this is rounded down if necessary to be
3668 // a power of two. We want power of two interleave count to simplify any
3669 // addressing operations or alignment considerations.
3670 // We also want power of two interleave counts to ensure that the induction
3671 // variable of the vector loop wraps to zero, when tail is folded by masking;
3672 // this currently happens when OptForSize, in which case IC is set to 1 above.
3673 unsigned IC = UINT_MAX;
3674
3675 for (const auto &Pair : R.MaxLocalUsers) {
3676 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
3677 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
3678 << " registers of "
3679 << TTI.getRegisterClassName(Pair.first)
3680 << " register class\n");
3681 if (VF.isScalar()) {
3682 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
3683 TargetNumRegisters = ForceTargetNumScalarRegs;
3684 } else {
3685 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
3686 TargetNumRegisters = ForceTargetNumVectorRegs;
3687 }
3688 unsigned MaxLocalUsers = Pair.second;
3689 unsigned LoopInvariantRegs = 0;
3690 if (R.LoopInvariantRegs.contains(Pair.first))
3691 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
3692
// NOTE(review): both operands are unsigned, so this subtraction wraps if
// LoopInvariantRegs exceeds TargetNumRegisters; the result is only bounded
// later by the MaxInterleaveCount clamp - confirm this is intended.
3693 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
3694 MaxLocalUsers);
3695 // Don't count the induction variable as interleaved.
// NOTE(review): the condition opening this adjustment (listing line 3696) is
// missing from this extract.
3697 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
3698 std::max(1U, (MaxLocalUsers - 1)));
3699 }
3700
// Keep the tightest bound across all register classes.
3701 IC = std::min(IC, TmpIC);
3702 }
3703
3704 // Clamp the interleave ranges to reasonable counts.
3705 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
3706 LLVM_DEBUG(dbgs() << "LV: MaxInterleaveFactor for the target is "
3707 << MaxInterleaveCount << "\n");
3708
3709 // Check if the user has overridden the max.
3710 if (VF.isScalar()) {
3711 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
3712 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
3713 } else {
3714 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
3715 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
3716 }
3717
3718 // Try to get the exact trip count, or an estimate based on profiling data or
3719 // ConstantMax from PSE, failing that.
3720 auto BestKnownTC =
3721 getSmallBestKnownTC(PSE, OrigLoop,
3722 /*CanUseConstantMax=*/true,
3723 /*CanExcludeZeroTrips=*/CM.isEpilogueAllowed());
3724
3725 // For fixed length VFs treat a scalable trip count as unknown.
3726 if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
3727 // Re-evaluate trip counts and VFs to be in the same numerical space.
3728 unsigned AvailableTC =
3729 estimateElementCount(*BestKnownTC, Config.getVScaleForTuning());
3730 unsigned EstimatedVF =
3731 estimateElementCount(VF, Config.getVScaleForTuning());
3732
3733 // At least one iteration must be scalar when this constraint holds. So the
3734 // maximum available iterations for interleaving is one less.
3735 if (CM.requiresScalarEpilogue(VF.isVector()))
3736 --AvailableTC;
3737
3738 unsigned InterleaveCountLB = bit_floor(std::max(
3739 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
3740
3741 if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
3742 // If the best known trip count is exact, we select between two
3743 // prospective ICs, where
3744 //
3745 // 1) the aggressive IC is capped by the trip count divided by VF
3746 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
3747 //
3748 // The final IC is selected in a way that the epilogue loop trip count is
3749 // minimized while maximizing the IC itself, so that we either run the
3750 // vector loop at least once if it generates a small epilogue loop, or
3751 // else we run the vector loop at least twice.
3752
3753 unsigned InterleaveCountUB = bit_floor(std::max(
3754 1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
3755 MaxInterleaveCount = InterleaveCountLB;
3756
3757 if (InterleaveCountUB != InterleaveCountLB) {
3758 unsigned TailTripCountUB =
3759 (AvailableTC % (EstimatedVF * InterleaveCountUB));
3760 unsigned TailTripCountLB =
3761 (AvailableTC % (EstimatedVF * InterleaveCountLB));
3762 // If both produce same scalar tail, maximize the IC to do the same work
3763 // in fewer vector loop iterations
3764 if (TailTripCountUB == TailTripCountLB)
3765 MaxInterleaveCount = InterleaveCountUB;
3766 }
3767 } else {
3768 // If trip count is an estimated compile time constant, limit the
3769 // IC to be capped by the trip count divided by VF * 2, such that the
3770 // vector loop runs at least twice to make interleaving seem profitable
3771 // when there is an epilogue loop present. Since exact Trip count is not
3772 // known we choose to be conservative in our IC estimate.
3773 MaxInterleaveCount = InterleaveCountLB;
3774 }
3775 }
3776
3777 assert(MaxInterleaveCount > 0 &&
3778 "Maximum interleave count must be greater than 0");
3779
3780 // Clamp the calculated IC to be between the 1 and the max interleave count
3781 // that the target and trip count allows.
3782 if (IC > MaxInterleaveCount)
3783 IC = MaxInterleaveCount;
3784 else
3785 // Make sure IC is greater than 0.
3786 IC = std::max(1u, IC);
3787
3788 assert(IC > 0 && "Interleave count must be greater than 0.");
3789
3790 // Interleave if we vectorized this loop and there is a reduction that could
3791 // benefit from interleaving.
3792 if (VF.isVector() && HasReductions) {
3793 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
3794 return IC;
3795 }
3796
3797 // For any scalar loop that either requires runtime checks or tail-folding we
3798 // are better off leaving this to the unroller. Note that if we've already
3799 // vectorized the loop we will have done the runtime check and so interleaving
3800 // won't require further checks.
3801 bool ScalarInterleavingRequiresPredication =
3802 (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
3803 return Legal->blockNeedsPredication(BB);
3804 }));
3805 bool ScalarInterleavingRequiresRuntimePointerCheck =
3806 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
3807
3808 // We want to interleave small loops in order to reduce the loop overhead and
3809 // potentially expose ILP opportunities.
3810 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
3811 << "LV: IC is " << IC << '\n'
3812 << "LV: VF is " << VF << '\n');
3813 const bool AggressivelyInterleave =
3814 TTI.enableAggressiveInterleaving(HasReductions);
3815 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
3816 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
3817 // We assume that the cost overhead is 1 and we use the cost model
3818 // to estimate the cost of the loop and interleave until the cost of the
3819 // loop overhead is about 5% of the cost of the loop.
3820 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
3821 SmallLoopCost / LoopCost.getValue()));
3822
3823 // Interleave until store/load ports (estimated by max interleave count) are
3824 // saturated.
3825 unsigned NumStores = 0;
3826 unsigned NumLoads = 0;
// NOTE(review): the outer block loop header and the two conditions that guard
// the NumLoads++/NumStores++ increments (listing lines 3827-3828, 3830, 3834)
// are missing from this extract.
3829 for (VPRecipeBase &R : *VPBB) {
3831 NumLoads++;
3832 continue;
3833 }
3835 NumStores++;
3836 continue;
3837 }
3838
// An interleave recipe counts as stores when it has store operands, otherwise
// as one load per value it defines.
3839 if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
3840 if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
3841 NumStores += StoreOps;
3842 else
3843 NumLoads += InterleaveR->getNumDefinedValues();
3844 continue;
3845 }
3846 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
3847 NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
3848 NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
3849 continue;
3850 }
// A histogram recipe is counted as both one load and one store.
3851 if (isa<VPHistogramRecipe>(&R)) {
3852 NumLoads++;
3853 NumStores++;
3854 continue;
3855 }
3856 }
3857 }
// Divide IC among the memory operations; a zero count is treated as one to
// avoid dividing by zero.
3858 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
3859 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
3860
3861 // There is little point in interleaving for reductions containing selects
3862 // and compares when VF=1 since it may just create more overhead than it's
3863 // worth for loops with small trip counts. This is because we still have to
3864 // do the final reduction after the loop.
// NOTE(review): the call that applies the lambda below (listing line 3867) is
// missing from this extract.
3865 bool HasSelectCmpReductions =
3866 HasReductions &&
3868 [](VPRecipeBase &R) {
3869 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
3870 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
3871 RedR->getRecurrenceKind()) ||
3872 RecurrenceDescriptor::isFindIVRecurrenceKind(
3873 RedR->getRecurrenceKind()));
3874 });
3875 if (HasSelectCmpReductions) {
3876 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
3877 return 1;
3878 }
3879
3880 // If we have a scalar reduction (vector reductions are already dealt with
3881 // by this point), we can increase the critical path length if the loop
3882 // we're interleaving is inside another loop. For tree-wise reductions
3883 // set the limit to 2, and for ordered reductions it's best to disable
3884 // interleaving entirely.
3885 if (HasReductions && OrigLoop->getLoopDepth() > 1) {
// NOTE(review): the call that applies the lambda below (listing line 3887) is
// missing from this extract.
3886 bool HasOrderedReductions =
3888 [](VPRecipeBase &R) {
3889 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
3890
3891 return RedR && RedR->isOrdered();
3892 });
3893 if (HasOrderedReductions) {
3894 LLVM_DEBUG(
3895 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
3896 return 1;
3897 }
3898
// Cap every candidate count by MaxNestedScalarReductionIC.
3899 unsigned F = MaxNestedScalarReductionIC;
3900 SmallIC = std::min(SmallIC, F);
3901 StoresIC = std::min(StoresIC, F);
3902 LoadsIC = std::min(LoadsIC, F);
3903 }
3904
// NOTE(review): the first half of this condition (listing line 3905) is
// missing from this extract.
3906 std::max(StoresIC, LoadsIC) > SmallIC) {
3907 LLVM_DEBUG(
3908 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
3909 return std::max(StoresIC, LoadsIC);
3910 }
3911
3912 // If there are scalar reductions and TTI has enabled aggressive
3913 // interleaving for reductions, we will interleave to expose ILP.
3914 if (VF.isScalar() && AggressivelyInterleave) {
3915 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
3916 // Interleave no less than SmallIC but not as aggressive as the normal IC
3917 // to satisfy the rare situation when resources are too limited.
3918 return std::max(IC / 2, SmallIC);
3919 }
3920
3921 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
3922 return SmallIC;
3923 }
3924
3925 // Interleave if this is a large loop (small loops are already dealt with by
3926 // this point) that could benefit from interleaving.
3927 if (AggressivelyInterleave) {
3928 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
3929 return IC;
3930 }
3931
3932 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
3933 return 1;
3934}
3935
// Decides whether the artificially high "hack" cost should be applied to an
// emulated masked memory instruction I at the given VF.
// Returns true for every load, and for a store only once NumPredStores
// exceeds NumberOfStoresToPredicate; returns false for anything else.
//
// NOTE(review): the first declarator line (listing line 3936) and the start of
// the assert (listing line 3946) are missing from this extract; the function
// is presumably LoopVectorizationCostModel::useEmulatedMaskMemRefHack taking
// an Instruction *I - confirm against the full file.
3937 ElementCount VF) {
3938 // TODO: Cost model for emulated masked load/store is completely
3939 // broken. This hack guides the cost model to use an artificially
3940 // high enough value to practically disable vectorization with such
3941 // operations, except where previously deployed legality hack allowed
3942 // using very low cost values. This is to avoid regressions coming simply
3943 // from moving "masked load/store" check from legality to cost model.
3944 // Masked Load/Gather emulation was previously never allowed.
3945 // Limited number of Masked Store/Scatter emulation was allowed.
3947 "Expecting a scalar emulated instruction");
3948 return isa<LoadInst>(I) ||
3949 (isa<StoreInst>(I) &&
3950 NumPredStores > NumberOfStoresToPredicate);
3951}
3952
// For a vector VF, records which predicated instructions are cheaper to
// scalarize and which blocks stay predicated after vectorization.
//
// Effects visible in this block:
//  * Creates the InstsToScalarize[VF] entry (possibly left empty) so that a
//    later call for the same VF returns immediately.
//  * For each instruction that is scalar-with-predication and passes the
//    filter below, runs computePredInstDiscount; on a non-negative discount
//    the collected scalar costs are merged into InstsToScalarize[VF], and any
//    call with an existing widening decision is switched to CM_Scalarize with
//    the computed cost.
//  * Adds the instruction's block, plus every predecessor whose single
//    successor is that block, to PredicatedBBsAfterVectorization[VF].
//
// NOTE(review): the declarator (listing line 3953), the per-block skip
// condition (listing line 3973) and one clause of the discount filter (listing
// line 3984) are missing from this extract; the function is presumably
// LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) -
// confirm against the full file.
3954 assert(VF.isVector() && "Expected VF >= 2");
3955
3956 // If we've already collected the instructions to scalarize or the predicated
3957 // BBs after vectorization, there's nothing to do. Collection may already have
3958 // occurred if we have a user-selected VF and are now computing the expected
3959 // cost for interleaving.
3960 if (InstsToScalarize.contains(VF) ||
3961 PredicatedBBsAfterVectorization.contains(VF))
3962 return;
3963
3964 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
3965 // not profitable to scalarize any instructions, the presence of VF in the
3966 // map will indicate that we've analyzed it already.
3967 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
3968
3969 // Find all the instructions that are scalar with predication in the loop and
3970 // determine if it would be better to not if-convert the blocks they are in.
3971 // If so, we also record the instructions to scalarize.
3972 for (BasicBlock *BB : TheLoop->blocks()) {
3974 continue;
3975 for (Instruction &I : *BB)
3976 if (isScalarWithPredication(&I, VF)) {
3977 ScalarCostsTy ScalarCosts;
3978 // Do not apply discount logic for:
3979 // 1. Scalars after vectorization, as there will only be a single copy
3980 // of the instruction.
3981 // 2. Scalable VF, as that would lead to invalid scalarization costs.
3982 // 3. Emulated masked memrefs, if a hacked cost is needed.
3983 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
3985 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
// Note: the structured bindings named I in the two loops below shadow the
// enclosing loop variable I for the duration of each loop.
3986 for (const auto &[I, IC] : ScalarCosts)
3987 ScalarCostsVF.insert({I, IC});
3988 // Check if we decided to scalarize a call. If so, update the widening
3989 // decision of the call to CM_Scalarize with the computed scalar cost.
3990 for (const auto &[I, Cost] : ScalarCosts) {
3991 auto *CI = dyn_cast<CallInst>(I);
3992 if (!CI || !CallWideningDecisions.contains({CI, VF}))
3993 continue;
3994 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
3995 CallWideningDecisions[{CI, VF}].Cost = Cost;
3996 }
3997 }
3998 // Remember that BB will remain after vectorization.
3999 PredicatedBBsAfterVectorization[VF].insert(BB);
4000 for (auto *Pred : predecessors(BB)) {
4001 if (Pred->getSingleSuccessor() == BB)
4002 PredicatedBBsAfterVectorization[VF].insert(Pred);
4003 }
4004 }
4005 }
4006}
4007
// Estimates how much cheaper (or more expensive) it is to keep PredInst and
// the single-use operand chain feeding it scalar inside its predicated block
// rather than vectorizing them at VF.
//
// Walks a worklist starting at PredInst. For each visited instruction it
// computes the vector cost and a scalar cost (including insert/extract and phi
// overhead, divided by getPredBlockCostDivisor for the instruction's block),
// stores the scalar cost in ScalarCosts, and adds (vector - scalar) to the
// running discount.
//
// \param PredInst    the predicated instruction that seeds the walk.
// \param ScalarCosts output map from each analyzed instruction to its scalar
//                    cost; instructions already present are not re-analyzed.
// \param VF          vectorization factor; getFixedValue() is called on it, so
//                    a fixed (non-scalable) VF is required on those paths.
// \returns the accumulated discount; non-negative means scalarizing is
//          considered beneficial.
//
// NOTE(review): listing lines 4020 (presumably the Worklist declaration), 4030
// (last clause of the first CanBeScalarized check), 4081 (the ScalarCost
// initializer) and 4089 (leading arguments of getScalarizationOverhead) are
// missing from this extract - confirm against the full file.
4008InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4009 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4010 assert(!isUniformAfterVectorization(PredInst, VF) &&
4011 "Instruction marked uniform-after-vectorization will be predicated");
4012
4013 // Initialize the discount to zero, meaning that the scalar version and the
4014 // vector version cost the same.
4015 InstructionCost Discount = 0;
4016
4017 // Holds instructions to analyze. The instructions we visit are mapped in
4018 // ScalarCosts. Those instructions are the ones that would be scalarized if
4019 // we find that the scalar version costs less.
4021
4022 // Returns true if the given instruction can be scalarized.
4023 auto CanBeScalarized = [&](Instruction *I) -> bool {
4024 // We only attempt to scalarize instructions forming a single-use chain
4025 // from the original predicated block that would otherwise be vectorized.
4026 // Although not strictly necessary, we give up on instructions we know will
4027 // already be scalar to avoid traversing chains that are unlikely to be
4028 // beneficial.
4029 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
4031 return false;
4032
4033 // If the instruction is scalar with predication, it will be analyzed
4034 // separately. We ignore it within the context of PredInst.
4035 if (isScalarWithPredication(I, VF))
4036 return false;
4037
4038 // If any of the instruction's operands are uniform after vectorization,
4039 // the instruction cannot be scalarized. This prevents, for example, a
4040 // masked load from being scalarized.
4041 //
4042 // We assume we will only emit a value for lane zero of an instruction
4043 // marked uniform after vectorization, rather than VF identical values.
4044 // Thus, if we scalarize an instruction that uses a uniform, we would
4045 // create uses of values corresponding to the lanes we aren't emitting code
4046 // for. This behavior can be changed by allowing getScalarValue to clone
4047 // the lane zero values for uniforms rather than asserting.
4048 for (Use &U : I->operands())
4049 if (auto *J = dyn_cast<Instruction>(U.get()))
4050 if (isUniformAfterVectorization(J, VF))
4051 return false;
4052
4053 // Otherwise, we can scalarize the instruction.
4054 return true;
4055 };
4056
4057 // Compute the expected cost discount from scalarizing the entire expression
4058 // feeding the predicated instruction. We currently only consider expressions
4059 // that are single-use instruction chains.
4060 Worklist.push_back(PredInst);
4061 while (!Worklist.empty()) {
4062 Instruction *I = Worklist.pop_back_val();
4063
4064 // If we've already analyzed the instruction, there's nothing to do.
4065 if (ScalarCosts.contains(I))
4066 continue;
4067
4068 // Cannot scalarize fixed-order recurrence phis at the moment.
4069 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
4070 continue;
4071
4072 // Compute the cost of the vector instruction. Note that this cost already
4073 // includes the scalarization overhead of the predicated instruction.
4074 InstructionCost VectorCost = getInstructionCost(I, VF);
4075
4076 // Compute the cost of the scalarized instruction. This cost is the cost of
4077 // the instruction as if it wasn't if-converted and instead remained in the
4078 // predicated block. We will scale this cost by block probability after
4079 // computing the scalarization overhead.
4080 InstructionCost ScalarCost =
4082
4083 // Compute the scalarization overhead of needed insertelement instructions
4084 // and phi nodes.
4085 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
4086 Type *WideTy = toVectorizedTy(I->getType(), VF);
4087 for (Type *VectorTy : getContainedTypes(WideTy)) {
4088 ScalarCost += TTI.getScalarizationOverhead(
4090 /*Insert=*/true,
4091 /*Extract=*/false, Config.CostKind);
4092 }
// One phi cost per lane is added on top of the insert overhead.
4093 ScalarCost += VF.getFixedValue() *
4094 TTI.getCFInstrCost(Instruction::PHI, Config.CostKind);
4095 }
4096
4097 // Compute the scalarization overhead of needed extractelement
4098 // instructions. For each of the instruction's operands, if the operand can
4099 // be scalarized, add it to the worklist; otherwise, account for the
4100 // overhead.
4101 for (Use &U : I->operands())
4102 if (auto *J = dyn_cast<Instruction>(U.get())) {
4103 assert(canVectorizeTy(J->getType()) &&
4104 "Instruction has non-scalar type");
4105 if (CanBeScalarized(J))
4106 Worklist.push_back(J);
4107 else if (needsExtract(J, VF)) {
4108 Type *WideTy = toVectorizedTy(J->getType(), VF);
4109 for (Type *VectorTy : getContainedTypes(WideTy)) {
4110 ScalarCost += TTI.getScalarizationOverhead(
4111 cast<VectorType>(VectorTy),
4112 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
4113 /*Extract*/ true, Config.CostKind);
4114 }
4115 }
4116 }
4117
4118 // Scale the total scalar cost by block probability.
4119 ScalarCost /= getPredBlockCostDivisor(Config.CostKind, I->getParent());
4120
4121 // Compute the discount. A non-negative discount means the vector version
4122 // of the instruction costs more, and scalarizing would be beneficial.
4123 Discount += VectorCost - ScalarCost;
4124 ScalarCosts[I] = ScalarCost;
4125 }
4126
4127 return Discount;
4128}
4129
// Computes the expected cost of the loop for a scalar VF: for every block of
// TheLoop, sums the per-instruction costs (skipping ValuesToIgnore), divides
// the block total by getPredBlockCostDivisor for that block, and accumulates
// the result into Cost, which is returned.
//
// NOTE(review): the declarator and, presumably, the declaration of Cost
// (listing lines 4130-4131), the statement defining the per-instruction cost C
// (listing line 4145) and the body of the cost-override branch (listing line
// 4149) are missing from this extract; the function is presumably
// LoopVectorizationCostModel::expectedCost(ElementCount VF) - confirm against
// the full file.
4132 assert(VF.isScalar() && "must only be called for scalar VFs");
4133
4134 // For each block.
4135 for (BasicBlock *BB : TheLoop->blocks()) {
4136 InstructionCost BlockCost;
4137
4138 // For each instruction in the old loop.
4139 for (Instruction &I : *BB) {
4140 // Skip ignored values.
// NOTE(review): given the assert above, VF.isVector() cannot hold here in
// asserts-enabled builds, so the VecValuesToIgnore clause looks unreachable -
// confirm whether it is intentional.
4141 if (ValuesToIgnore.count(&I) ||
4142 (VF.isVector() && VecValuesToIgnore.count(&I)))
4143 continue;
4144
4146
4147 // Check if we should override the cost.
4148 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
4150
4151 BlockCost += C;
4152 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
4153 << VF << " For instruction: " << I << '\n');
4154 }
4155
4156 // In the scalar loop, we may not always execute the predicated block, if it
4157 // is an if-else block. Thus, scale the block's cost by the probability of
4158 // executing it. getPredBlockCostDivisor will return 1 for blocks that are
4159 // only predicated by the header mask when folding the tail.
4160 Cost += BlockCost / getPredBlockCostDivisor(Config.CostKind, BB);
4161 }
4162
4163 return Cost;
4164}
4165
4166/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
4167/// according to isAddressSCEVForCost.
4168///
4169/// This SCEV can be sent to the Target in order to estimate the address
4170/// calculation cost.
///
/// Returns the SCEV obtained from PSE.getSCEV(Ptr) when
/// vputils::isAddressSCEVForCost accepts it for TheLoop, and nullptr
/// otherwise.
// NOTE(review): the first declarator line (listing line 4171) and the
// parameter line that declares PSE (listing line 4173) are missing from this
// extract.
4172 Value *Ptr,
4174 const Loop *TheLoop) {
4175 const SCEV *Addr = PSE.getSCEV(Ptr);
4176 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), TheLoop) ? Addr
4177 : nullptr;
4178}
4179
// Estimates the cost of scalarizing the load/store I when vectorizing at VF.
//
// The visible computation adds, per lane (VF.getFixedValue() times), the
// address computation cost and the scalar memory operation cost, then the
// insert/extract overhead from getScalarizationOverhead. For a predicated
// instruction the total is divided by getPredBlockCostDivisor and the cost of
// extracting the i1 mask lanes plus a conditional branch is added; a final
// branch may replace the result with the constant 3000000.
//
// NOTE(review): listing lines 4180 (return type), 4186 (the value returned for
// scalable VF), 4192 (definition of Ptr), 4202 (declaration of Cost), 4226
// (construction of VecI1Ty) and 4232 (condition guarding the 3000000
// override, presumably the emulated-mask hack check) are missing from this
// extract - confirm against the full file.
4181LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
4182 ElementCount VF) {
4183 assert(VF.isVector() &&
4184 "Scalarization cost of instruction implies vectorization.");
4185 if (VF.isScalable())
4187
4188 Type *ValTy = getLoadStoreType(I);
4189 auto *SE = PSE.getSE();
4190
4191 unsigned AS = getLoadStoreAddressSpace(I);
4193 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
4194 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
4195 // that it is being called from this specific place.
4196
4197 // Figure out whether the access is strided and get the stride value
4198 // if it's known in compile time
4199 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);
4200
4201 // Get the cost of the scalar memory instruction and address computation.
4203 VF.getFixedValue() *
4204 TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV, Config.CostKind);
4205
4206 // Don't pass *I here, since it is scalar but will actually be part of a
4207 // vectorized loop where the user of it is a vectorized instruction.
4208 const Align Alignment = getLoadStoreAlignment(I);
4209 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
4210 Cost += VF.getFixedValue() *
4211 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
4212 AS, Config.CostKind, OpInfo);
4213
4214 // Get the overhead of the extractelement and insertelement instructions
4215 // we might create due to scalarization.
4216 Cost += getScalarizationOverhead(I, VF);
4217
4218 // If we have a predicated load/store, it will need extra i1 extracts and
4219 // conditional branches, but may not be executed for each vector lane. Scale
4220 // the cost by the probability of executing the predicated block.
4221 if (isPredicatedInst(I)) {
4222 Cost /= getPredBlockCostDivisor(Config.CostKind, I->getParent());
4223
4224 // Add the cost of an i1 extract and a branch
4225 auto *VecI1Ty =
4227 Cost += TTI.getScalarizationOverhead(
4228 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
4229 /*Insert=*/false, /*Extract=*/true, Config.CostKind);
4230 Cost += TTI.getCFInstrCost(Instruction::CondBr, Config.CostKind);
4231
4233 // Artificially setting to a high enough value to practically disable
4234 // vectorization with such operations.
4235 Cost = 3000000;
4236 }
4237
4238 return Cost;
4239}
4240
// Estimates the cost of widening the consecutive (stride 1 or -1) load/store I
// to a vector of VF elements.
//
// Uses the masked load/store intrinsic cost when a mask is required, otherwise
// the plain vector memory-op cost; a reverse shuffle cost is added when the
// stride is negative.
//
// NOTE(review): listing lines 4241 (return type), 4246 (definition of Ptr) and
// 4253 (declaration of Cost) are missing from this extract - confirm against
// the full file.
4242LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
4243 ElementCount VF) {
4244 Type *ValTy = getLoadStoreType(I);
4245 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4247 unsigned AS = getLoadStoreAddressSpace(I);
4248 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
4249
4250 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
4251 "Stride should be 1 or -1 for consecutive memory access");
4252 const Align Alignment = getLoadStoreAlignment(I);
4254 if (isMaskRequired(I)) {
// Masked accesses are costed as the matching masked load/store intrinsic.
4255 unsigned IID = I->getOpcode() == Instruction::Load
4256 ? Intrinsic::masked_load
4257 : Intrinsic::masked_store;
4258 Cost += TTI.getMemIntrinsicInstrCost(
4259 MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS),
4260 Config.CostKind);
4261 } else {
4262 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
4263 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
4264 Config.CostKind, OpInfo, I);
4265 }
4266
// A negative stride additionally pays for reversing the vector.
4267 bool Reverse = ConsecutiveStride < 0;
4268 if (Reverse)
4269 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
4270 VectorTy, {}, Config.CostKind, 0);
4271 return Cost;
4272}
4273
// Estimates the cost of a uniform memory operation I (asserted via
// Legal->isUniformMemOp) at VF.
//
// Loads: address computation + one scalar load + a broadcast shuffle.
// Stores: address computation + one scalar store, plus an extract-from-end
// cost when the stored value is not loop-invariant.
//
// NOTE(review): listing lines 4274 (return type), 4280 (definition of PtrTy)
// and 4299 (declaration of Cost for the store path) are missing from this
// extract - confirm against the full file.
4275LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
4276 ElementCount VF) {
4277 assert(Legal->isUniformMemOp(*I, VF));
4278
4279 Type *ValTy = getLoadStoreType(I);
4281 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4282 const Align Alignment = getLoadStoreAlignment(I);
4283 unsigned AS = getLoadStoreAddressSpace(I);
4284 if (isa<LoadInst>(I)) {
4285 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
4286 Config.CostKind) +
4287 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
4288 Config.CostKind) +
4289 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
4290 VectorTy, {}, Config.CostKind);
4291 }
// Anything that is not a load must be a store; cast<> asserts otherwise.
4292 StoreInst *SI = cast<StoreInst>(I);
4293
4294 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
4295 // TODO: We have existing tests that request the cost of extracting element
4296 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
4297 // the actual generated code, which involves extracting the last element of
4298 // a scalable vector where the lane to extract is unknown at compile time.
4300 TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, Config.CostKind) +
4301 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
4302 Config.CostKind);
4303 if (!IsLoopInvariantStoreValue)
4304 Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
4305 VectorTy, Config.CostKind, 0);
4306 return Cost;
4307}
4308
// Estimates the cost of implementing the load/store I at VF as a masked
// gather (loads) or masked scatter (stores).
//
// Returns the address computation cost plus the gather/scatter intrinsic
// cost. The pointer type passed to the address cost is widened to a vector
// unless Legal reports the pointer as uniform for VF.
//
// NOTE(review): listing lines 4309 (return type) and 4315 (definition of Ptr)
// are missing from this extract - confirm against the full file.
4310LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
4311 ElementCount VF) {
4312 Type *ValTy = getLoadStoreType(I);
4313 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4314 const Align Alignment = getLoadStoreAlignment(I);
4316 Type *PtrTy = Ptr->getType();
4317
4318 if (!Legal->isUniform(Ptr, VF))
4319 PtrTy = toVectorTy(PtrTy, VF);
4320
4321 unsigned IID = I->getOpcode() == Instruction::Load
4322 ? Intrinsic::masked_gather
4323 : Intrinsic::masked_scatter;
4324 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
4325 Config.CostKind) +
4326 TTI.getMemIntrinsicInstrCost(
4327 MemIntrinsicCostAttributes(IID, VectorTy, Ptr, isMaskRequired(I),
4328 Alignment, I),
4329 Config.CostKind);
4330}
4331
// Cost of handling the whole interleave group that \p I belongs to at \p VF.
// The group is costed as one interleaved memory operation on a vector of
// VF * factor elements, plus one reverse shuffle per member when the group is
// reversed. Asserts that \p I is a member of an interleave group.
// NOTE(review): the embedded listing numbers skip 4332 and 4363, so the return
// type and the first line of the assert inside the isReverse() branch are not
// visible in this extract -- confirm against the upstream file.
4333LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
4334 ElementCount VF) {
4335 const auto *Group = getInterleavedAccessGroup(I);
4336 assert(Group && "Fail to get an interleaved access group.");
4337
// Element type and address space are taken from the group's insert position,
// not from \p I itself.
4338 Instruction *InsertPos = Group->getInsertPos();
4339 Type *ValTy = getLoadStoreType(InsertPos);
4340 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4341 unsigned AS = getLoadStoreAddressSpace(InsertPos);
4342
4343 unsigned InterleaveFactor = Group->getFactor();
4344 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4345
4346 // Holds the indices of existing members in the interleaved group.
4347 SmallVector<unsigned, 4> Indices;
4348 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4349 if (Group->getMember(IF))
4350 Indices.push_back(IF);
4351
4352 // Calculate the cost of the whole interleaved group.
// Gaps are masked when the group requires a scalar epilogue that is not
// allowed, or when \p I is a store and the group is not full.
4353 bool UseMaskForGaps =
4354 (Group->requiresScalarEpilogue() && !isEpilogueAllowed()) ||
4355 (isa<StoreInst>(I) && !Group->isFull());
4356 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
4357 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
4358 Group->getAlign(), AS, Config.CostKind, isMaskRequired(I),
4359 UseMaskForGaps);
4360
4361 if (Group->isReverse()) {
4362 // TODO: Add support for reversed masked interleaved access.
4364 "Reverse masked interleaved access not supported.");
4365 Cost += Group->getNumMembers() *
4366 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
4367 VectorTy, {}, Config.CostKind, 0);
4368 }
4369 return Cost;
4370}
4371
// Tries to cost instruction I as part of an in-loop reduction pattern at \p VF
// with vector type \p Ty.
//
// Returns std::nullopt when the caller should fall back to the regular cost
// model: there are no in-loop reductions, VF is scalar, \p Ty is not a vector
// type, or I does not lead to an in-loop reduction chain. Otherwise returns the
// cost to charge to I: the fused pattern cost (or the plain reduction cost) for
// the reduction instruction itself, and 0 for the other instructions of a
// pattern that is cheaper when fused.
//
// NOTE(review): the embedded listing numbers skip 4373, 4424-4425, 4449-4450
// and 4457, so the qualified function name, the condition selecting the
// min/max branch, the tail of the `RedOp` initializer and part of the first
// match() are not visible in this extract. The name is presumably
// LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, ...),
// based on the call with (CI, VF, RetTy) elsewhere in this file -- confirm
// against the upstream file.
4372std::optional<InstructionCost>
4374 ElementCount VF,
4375 Type *Ty) const {
4376 using namespace llvm::PatternMatch;
4377 // Early exit for no inloop reductions
4378 if (Config.getInLoopReductions().empty() || VF.isScalar() ||
4379 !isa<VectorType>(Ty))
4380 return std::nullopt;
4381 auto *VectorTy = cast<VectorType>(Ty);
4382
4383 // We are looking for a pattern of, and finding the minimal acceptable cost:
4384 // reduce(mul(ext(A), ext(B))) or
4385 // reduce(mul(A, B)) or
4386 // reduce(ext(A)) or
4387 // reduce(A).
4388 // The basic idea is that we walk down the tree to do that, finding the root
4389 // reduction instruction in InLoopReductionImmediateChains. From there we find
4390 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
4391 // of the components. If the reduction cost is lower then we return it for the
4392 // reduction instruction and 0 for the other instructions in the pattern. If
4393 // it is not we return an invalid cost specifying the original cost method
4394 // should be used.
4395 Instruction *RetI = I;
// Step from an extend to its single user; an extend with more than one user
// cannot be folded into the pattern.
4396 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
4397 if (!RetI->hasOneUser())
4398 return std::nullopt;
4399 RetI = RetI->user_back();
4400 }
4401
// Step from a single-use mul to its user when that user is an add.
4402 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
4403 RetI->user_back()->getOpcode() == Instruction::Add) {
4404 RetI = RetI->user_back();
4405 }
4406
4407 // Test if the found instruction is a reduction, and if not return an invalid
4408 // cost specifying the parent to use the original cost modelling.
4409 Instruction *LastChain = Config.getInLoopReductionImmediateChain(RetI);
4410 if (!LastChain)
4411 return std::nullopt;
4412
4413 // Find the reduction this chain is a part of and calculate the basic cost of
4414 // the reduction on its own.
4415 Instruction *ReductionPhi = LastChain;
4416 while (!isa<PHINode>(ReductionPhi))
4417 ReductionPhi = Config.getInLoopReductionImmediateChain(ReductionPhi);
4418
4419 const RecurrenceDescriptor &RdxDesc =
4420 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
4421
4422 InstructionCost BaseCost;
4423 RecurKind RK = RdxDesc.getRecurrenceKind();
4426 BaseCost = TTI.getMinMaxReductionCost(
4427 MinMaxID, VectorTy, RdxDesc.getFastMathFlags(), Config.CostKind);
4428 } else {
4429 BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy,
4430 RdxDesc.getFastMathFlags(),
4431 Config.CostKind);
4432 }
4433
4434 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
4435 // normal fmul instruction to the cost of the fadd reduction.
4436 if (RK == RecurKind::FMulAdd)
4437 BaseCost += TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy,
4438 Config.CostKind);
4439
4440 // If we're using ordered reductions then we can just return the base cost
4441 // here, since getArithmeticReductionCost calculates the full ordered
4442 // reduction cost when FP reassociation is not allowed.
4443 if (Config.useOrderedReductions(RdxDesc))
4444 return BaseCost;
4445
4446 // Get the operand that was not the reduction chain and match it to one of the
4447 // patterns, returning the better cost if it is found.
4448 Instruction *RedOp = RetI->getOperand(1) == LastChain
4451
// From here on VectorTy uses the type of I's first operand as its element
// type, keeping the element count of the original VectorTy.
4452 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
4453
4454 Instruction *Op0, *Op1;
4455 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
4456 match(RedOp,
4458 match(Op0, m_ZExtOrSExt(m_Value())) &&
4459 Op0->getOpcode() == Op1->getOpcode() &&
4460 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
4461 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
4462 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
4463
4464 // Matched reduce.add(ext(mul(ext(A), ext(B))))
4465 // Note that the extend opcodes need to all match, or if A==B they will have
4466 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
4467 // which is equally fine.
4468 bool IsUnsigned = isa<ZExtInst>(Op0);
4469 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
4470 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
4471
4472 InstructionCost ExtCost =
4473 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
4474 TTI::CastContextHint::None, Config.CostKind, Op0);
4475 InstructionCost MulCost =
4476 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, Config.CostKind);
4477 InstructionCost Ext2Cost = TTI.getCastInstrCost(
4478 RedOp->getOpcode(), VectorTy, MulType, TTI::CastContextHint::None,
4479 Config.CostKind, RedOp);
4480
4481 InstructionCost RedCost = TTI.getMulAccReductionCost(
4482 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
4483 Config.CostKind);
4484
// Use the fused cost only when it is valid and strictly cheaper than the two
// inner extends, the mul, the outer extend and the plain reduction together.
4485 if (RedCost.isValid() &&
4486 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
4487 return I == RetI ? RedCost : 0;
4488 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
4489 !TheLoop->isLoopInvariant(RedOp)) {
4490 // Matched reduce(ext(A))
4491 bool IsUnsigned = isa<ZExtInst>(RedOp);
4492 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
4493 InstructionCost RedCost = TTI.getExtendedReductionCost(
4494 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
4495 RdxDesc.getFastMathFlags(), Config.CostKind);
4496
4497 InstructionCost ExtCost = TTI.getCastInstrCost(
4498 RedOp->getOpcode(), VectorTy, ExtType, TTI::CastContextHint::None,
4499 Config.CostKind, RedOp);
4500 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
4501 return I == RetI ? RedCost : 0;
4502 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
4503 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
4504 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
4505 Op0->getOpcode() == Op1->getOpcode() &&
4506 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
4507 bool IsUnsigned = isa<ZExtInst>(Op0);
4508 Type *Op0Ty = Op0->getOperand(0)->getType();
4509 Type *Op1Ty = Op1->getOperand(0)->getType();
4510 Type *LargestOpTy =
4511 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
4512 : Op0Ty;
4513 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
4514
4515 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
4516 // different sizes. We take the largest type as the ext to reduce, and add
4517 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
4518 InstructionCost ExtCost0 = TTI.getCastInstrCost(
4519 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
4520 TTI::CastContextHint::None, Config.CostKind, Op0);
4521 InstructionCost ExtCost1 = TTI.getCastInstrCost(
4522 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
4523 TTI::CastContextHint::None, Config.CostKind, Op1);
4524 InstructionCost MulCost = TTI.getArithmeticInstrCost(
4525 Instruction::Mul, VectorTy, Config.CostKind);
4526
4527 InstructionCost RedCost = TTI.getMulAccReductionCost(
4528 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
4529 Config.CostKind);
4530 InstructionCost ExtraExtCost = 0;
4531 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
4532 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
4533 ExtraExtCost = TTI.getCastInstrCost(
4534 ExtraExtOp->getOpcode(), ExtType,
4535 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
4536 TTI::CastContextHint::None, Config.CostKind, ExtraExtOp);
4537 }
4538
// ExtraExtCost only takes part in the profitability comparison; the value
// returned for the reduction instruction is RedCost alone.
4539 if (RedCost.isValid() &&
4540 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
4541 return I == RetI ? RedCost : 0;
4542 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
4543 // Matched reduce.add(mul())
4544 InstructionCost MulCost = TTI.getArithmeticInstrCost(
4545 Instruction::Mul, VectorTy, Config.CostKind);
4546
4547 InstructionCost RedCost = TTI.getMulAccReductionCost(
4548 true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
4549 Config.CostKind);
4550
4551 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
4552 return I == RetI ? RedCost : 0;
4553 }
4554 }
4555
// No fused pattern was profitable: the reduction instruction gets the plain
// reduction cost, every other instruction falls back to the regular cost model.
4556 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
4557}
4558
// Cost of the memory instruction \p I at \p VF. For a scalar VF the cost is
// computed here as address computation plus the memory operation; for any
// other VF the cost previously recorded with the widening decision is
// returned via getWideningCost().
// NOTE(review): the embedded listing numbers skip 4559 and 4566, so the return
// type and the definition of `PtrTy` are not visible in this extract --
// confirm against the upstream file.
4560LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
4561 ElementCount VF) {
4562 // Calculate scalar cost only. Vectorization cost should be ready at this
4563 // moment.
4564 if (VF.isScalar()) {
4565 Type *ValTy = getLoadStoreType(I);
4567 const Align Alignment = getLoadStoreAlignment(I);
4568 unsigned AS = getLoadStoreAddressSpace(I);
4569
// Operand info is taken from operand 0 of the instruction.
4570 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
4571 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
4572 Config.CostKind) +
4573 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
4574 Config.CostKind, OpInfo, I);
4575 }
4576 return getWideningCost(I, VF);
4577}
4578
// Estimates the overhead of scalarizing instruction \p I at \p VF: the cost of
// inserting scalar results into the vector result type(s), plus the cost of
// scalarizing the operands that require extraction. Returns 0 for a scalar VF.
// NOTE(review): the embedded listing numbers skip 4579, 4586, 4591, 4596,
// 4598, 4600, 4604, 4624 and 4628-4630, so the return type, the statement
// executed for scalable VFs, and the definitions of `Cost`, `VIC`, `Tys` and
// `OperandVIC` are not visible in this extract -- confirm against the
// upstream file.
4580LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
4581 ElementCount VF) const {
4582
4583 // There is no mechanism yet to create a scalable scalarization loop,
4584 // so this is currently Invalid.
4585 if (VF.isScalable())
4587
4588 if (VF.isScalar())
4589 return 0;
4590
// Result insertion is costed only for non-void results, and skipped for loads
// when the target supports efficient vector element load/store.
4592 Type *RetTy = toVectorizedTy(I->getType(), VF);
4593 if (!RetTy->isVoidTy() &&
4594 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
4595
4597 if (isa<LoadInst>(I))
4599 else if (isa<StoreInst>(I))
4601
4602 for (Type *VectorTy : getContainedTypes(RetTy)) {
4603 Cost += TTI.getScalarizationOverhead(
4605 /*Insert=*/true, /*Extract=*/false, Config.CostKind,
4606 /*ForPoisonSrc=*/true, {}, VIC);
4607 }
4608 }
4609
4610 // Some targets keep addresses scalar.
4611 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
4612 return Cost;
4613
4614 // Some targets support efficient element stores.
4615 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
4616 return Cost;
4617
4618 // Collect operands to consider.
// For calls only the call arguments are considered, not all operands.
4619 CallInst *CI = dyn_cast<CallInst>(I);
4620 Instruction::op_range Ops = CI ? CI->args() : I->operands();
4621
4622 // Skip operands that do not require extraction/scalarization and do not incur
4623 // any overhead.
4625 for (auto *V : filterExtractingOperands(Ops, VF))
4626 Tys.push_back(maybeVectorizeType(V->getType(), VF));
4627
4631 return Cost +
4632 TTI.getOperandsScalarizationOverhead(Tys, Config.CostKind, OperandVIC);
4633}
4634
// Records, for vector factor VF, a widening decision and cost for the memory
// instructions of TheLoop. Unless the target prefers vectorized addressing, it
// then makes sure the instructions that compute addresses stay scalar.
// Does nothing for a scalar VF.
// NOTE(review): the embedded listing numbers skip 4635 (the signature line)
// and many lines inside the body (e.g. 4646, 4652, 4683, 4691, 4707, 4717,
// 4728, 4735, 4742, 4780, 4784, 4786, 4791, 4807, 4809, 4811-4812, 4828,
// 4843), so several definitions and conditions are not visible in this
// extract. This is presumably
// LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF)
// -- confirm against the upstream file.
4636 if (VF.isScalar())
4637 return;
4638
4639 // TODO: We should generate better code and update the cost model for
4640 // predicated uniform stores. Today they are treated as any other
4641 // predicated store (see added test cases in
4642 // invariant-store-vectorization.ll).
// NumPredStores is recomputed from scratch here; the condition guarding the
// increment is on a line missing from this extract.
4643 NumPredStores = 0;
4644 for (BasicBlock *BB : TheLoop->blocks())
4645 for (Instruction &I : *BB)
4647 ++NumPredStores;
4648
// Phase 1: choose a widening decision per instruction; instructions for which
// `Ptr` is null are skipped.
4649 for (BasicBlock *BB : TheLoop->blocks()) {
4650 // For each instruction in the old loop.
4651 for (Instruction &I : *BB) {
4653 if (!Ptr)
4654 continue;
4655
4656 if (Legal->isUniformMemOp(I, VF)) {
4657 auto IsLegalToScalarize = [&]() {
4658 if (!VF.isScalable())
4659 // Scalarization of fixed length vectors "just works".
4660 return true;
4661
4662 // We have dedicated lowering for unpredicated uniform loads and
4663 // stores. Note that even with tail folding we know that at least
4664 // one lane is active (i.e. generalized predication is not possible
4665 // here), and the logic below depends on this fact.
4666 if (!foldTailByMasking())
4667 return true;
4668
4669 // For scalable vectors, a uniform memop load is always
4670 // uniform-by-parts and we know how to scalarize that.
4671 if (isa<LoadInst>(I))
4672 return true;
4673
4674 // A uniform store isn't necessarily uniform-by-part
4675 // and we can't assume scalarization.
4676 auto &SI = cast<StoreInst>(I);
4677 return TheLoop->isLoopInvariant(SI.getValueOperand());
4678 };
4679
4680 const InstructionCost GatherScatterCost =
4681 Config.isLegalGatherOrScatter(&I, VF)
4682 ? getGatherScatterCost(&I, VF)
4684
4685 // Load: Scalar load + broadcast
4686 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
4687 // FIXME: This cost is a significant under-estimate for tail folded
4688 // memory ops.
4689 const InstructionCost ScalarizationCost =
4690 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
4692
4693 // Choose better solution for the current VF, Note that Invalid
4694 // costs compare as maximally large. If both are invalid, we get
4695 // scalable invalid which signals a failure and a vectorization abort.
// On a tie the comparison below picks scalarization.
4696 if (GatherScatterCost < ScalarizationCost)
4697 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
4698 else
4699 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
4700 continue;
4701 }
4702
4703 // We assume that widening is the best solution when possible.
4704 if (memoryInstructionCanBeWidened(&I, VF)) {
4705 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
4706 int ConsecutiveStride = Legal->isConsecutivePtr(
4708 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
4709 "Expected consecutive stride.");
4710 InstWidening Decision =
4711 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
4712 setWideningDecision(&I, VF, Decision, Cost);
4713 continue;
4714 }
4715
4716 // Choose between Interleaving, Gather/Scatter or Scalarization.
4718 unsigned NumAccesses = 1;
4719 if (isAccessInterleaved(&I)) {
4720 const auto *Group = getInterleavedAccessGroup(&I);
4721 assert(Group && "Fail to get an interleaved access group.");
4722
4723 // Make one decision for the whole group.
4724 if (getWideningDecision(&I, VF) != CM_Unknown)
4725 continue;
4726
4727 NumAccesses = Group->getNumMembers();
4729 InterleaveCost = getInterleaveGroupCost(&I, VF);
4730 }
4731
// Gather/scatter and scalarization costs are per access, so they are scaled by
// the number of group members to be comparable with the whole-group
// interleave cost.
4732 InstructionCost GatherScatterCost =
4733 Config.isLegalGatherOrScatter(&I, VF)
4734 ? getGatherScatterCost(&I, VF) * NumAccesses
4736
4737 InstructionCost ScalarizationCost =
4738 getMemInstScalarizationCost(&I, VF) * NumAccesses;
4739
4740 // Choose better solution for the current VF,
4741 // write down this decision and use it during vectorization.
// Tie-breaking: interleaving wins a tie with gather/scatter (<=) but must be
// strictly cheaper than scalarization; gather/scatter must be strictly
// cheaper than scalarization.
4743 InstWidening Decision;
4744 if (InterleaveCost <= GatherScatterCost &&
4745 InterleaveCost < ScalarizationCost) {
4746 Decision = CM_Interleave;
4747 Cost = InterleaveCost;
4748 } else if (GatherScatterCost < ScalarizationCost) {
4749 Decision = CM_GatherScatter;
4750 Cost = GatherScatterCost;
4751 } else {
4752 Decision = CM_Scalarize;
4753 Cost = ScalarizationCost;
4754 }
4755 // If the instruction belongs to an interleave group, the whole group
4756 // receives the same decision. The whole group receives the cost, but
4757 // the cost will actually be assigned to one instruction.
4758 if (const auto *Group = getInterleavedAccessGroup(&I)) {
4759 if (Decision == CM_Scalarize) {
// When scalarizing, each member gets its own scalarization cost rather than
// the aggregated group cost. Note that this loop variable shadows the outer
// `I`.
4760 for (Instruction *I : Group->members())
4761 setWideningDecision(I, VF, Decision,
4762 getMemInstScalarizationCost(I, VF));
4763 } else {
4764 setWideningDecision(Group, VF, Decision, Cost);
4765 }
4766 } else
4767 setWideningDecision(&I, VF, Decision, Cost);
4768 }
4769 }
4770
// Phase 2: keep address computations scalar.
4771 // Make sure that any load of address and any other address computation
4772 // remains scalar unless there is gather/scatter support. This avoids
4773 // inevitable extracts into address registers, and also has the benefit of
4774 // activating LSR more, since that pass can't optimize vectorized
4775 // addresses.
4776 if (TTI.prefersVectorizedAddressing())
4777 return;
4778
4779 // Start with all scalar pointer uses.
4781 for (BasicBlock *BB : TheLoop->blocks())
4782 for (Instruction &I : *BB) {
4783 Instruction *PtrDef =
4785 if (PtrDef && TheLoop->contains(PtrDef) &&
4787 AddrDefs.insert(PtrDef);
4788 }
4789
4790 // Add all instructions used to generate the addresses.
// Worklist traversal over operands; only non-PHI instructions inside the loop
// are added, and the insert().second check ensures each is queued once.
4792 append_range(Worklist, AddrDefs);
4793 while (!Worklist.empty()) {
4794 Instruction *I = Worklist.pop_back_val();
4795 for (auto &Op : I->operands())
4796 if (auto *InstOp = dyn_cast<Instruction>(Op))
4797 if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
4798 AddrDefs.insert(InstOp).second)
4799 Worklist.push_back(InstOp);
4800 }
4801
4802 auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
4803 // If there are direct memory op users of the newly scalarized load,
4804 // their cost may have changed because there's no scalarization
4805 // overhead for the operand. Update it.
4806 for (User *U : LI->users()) {
4808 continue;
4810 continue;
4813 getMemInstScalarizationCost(cast<Instruction>(U), VF));
4814 }
4815 };
4816 for (auto *I : AddrDefs) {
4817 if (isa<LoadInst>(I)) {
4818 // Setting the desired widening decision should ideally be handled
4819 // by cost functions, but since this involves the task of finding out
4820 // if the loaded register is involved in an address computation, it is
4821 // instead changed here when we know this is the case.
4822 InstWidening Decision = getWideningDecision(I, VF);
4823 if (!isPredicatedInst(I) &&
4824 (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
4825 (!Legal->isUniformMemOp(*I, VF) && Decision == CM_Scalarize))) {
4826 // Scalarize a widened load of address or update the cost of a scalar
4827 // load of an address.
// The recorded cost is VF.getKnownMinValue() times the scalar (VF=1) memory
// instruction cost.
4829 I, VF, CM_Scalarize,
4830 (VF.getKnownMinValue() *
4831 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
4832 UpdateMemOpUserCost(cast<LoadInst>(I));
4833 } else if (const auto *Group = getInterleavedAccessGroup(I)) {
4834 // Scalarize all members of this interleaved group when any member
4835 // is used as an address. The address-used load skips scalarization
4836 // overhead, other members include it.
4837 for (Instruction *Member : Group->members()) {
4838 InstructionCost Cost = AddrDefs.contains(Member)
4839 ? (VF.getKnownMinValue() *
4840 getMemoryInstructionCost(
4841 Member, ElementCount::getFixed(1)))
4842 : getMemInstScalarizationCost(Member, VF);
4844 UpdateMemOpUserCost(cast<LoadInst>(Member));
4845 }
4846 }
4847 } else {
4848 // Cannot scalarize fixed-order recurrence phis at the moment.
4849 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
4850 continue;
4851
4852 // Make sure I gets scalarized and a cost estimate without
4853 // scalarization overhead.
4854 ForcedScalars[VF].insert(I);
4855 }
4856 }
4857}
4858
// For every call instruction in TheLoop, decides how the call is handled at
// vector factor VF and records the decision with setCallWideningDecision().
// Three options are costed: scalarizing the call, calling a vector variant of
// the callee, and using a vector intrinsic. Must not be called with a scalar
// VF (asserted).
// NOTE(review): the embedded listing numbers skip 4859 (the signature line)
// and 4867, 4872-4874, 4920, 4922-4923, 4946, 4948, 4957, 4966, 4968, 4972,
// 4994, 4996 and 5007, so the definitions of `CI`, `ScalarCost`, `VectorCost`,
// `IntrinsicCost` and `IID`, the switch case labels and a few statements are
// not visible in this extract. This is presumably
// LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) --
// confirm against the upstream file.
4860 assert(!VF.isScalar() &&
4861 "Trying to set a vectorization decision for a scalar VF");
4862
4863 auto ForcedScalar = ForcedScalars.find(VF);
4864 for (BasicBlock *BB : TheLoop->blocks()) {
4865 // For each instruction in the old loop.
4866 for (Instruction &I : *BB) {
4868
4869 if (!CI)
4870 continue;
4871
4875 Function *ScalarFunc = CI->getCalledFunction();
4876 Type *ScalarRetTy = CI->getType();
4877 SmallVector<Type *, 4> Tys, ScalarTys;
4878 for (auto &ArgOp : CI->args())
4879 ScalarTys.push_back(ArgOp->getType());
4880
4881 // Estimate cost of scalarized vector call. The source operands are
4882 // assumed to be vectors, so we need to extract individual elements from
4883 // there, execute VF scalar calls, and then gather the result into the
4884 // vector return value.
4885 if (VF.isFixed()) {
4886 InstructionCost ScalarCallCost = TTI.getCallInstrCost(
4887 ScalarFunc, ScalarRetTy, ScalarTys, Config.CostKind);
4888
4889 // Compute costs of unpacking argument values for the scalar calls and
4890 // packing the return values to a vector.
4891 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
4892 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
4893 } else {
4894 // There is no point attempting to calculate the scalar cost for a
4895 // scalable VF as we know it will be Invalid.
4896 assert(!getScalarizationOverhead(CI, VF).isValid() &&
4897 "Unexpected valid cost for scalarizing scalable vectors");
4898 ScalarCost = InstructionCost::getInvalid();
4899 }
4900
4901 // Honor ForcedScalars and UniformAfterVectorization decisions.
4902 // TODO: For calls, it might still be more profitable to widen. Use
4903 // VPlan-based cost model to compare different options.
4904 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
4905 ForcedScalar->second.contains(CI)) ||
4906 isUniformAfterVectorization(CI, VF))) {
4907 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
4908 Intrinsic::not_intrinsic, ScalarCost);
4909 continue;
4910 }
4911
4912 bool MaskRequired = isMaskRequired(CI);
4913 // Compute corresponding vector type for return value and arguments.
4914 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
4915 for (Type *ScalarTy : ScalarTys)
4916 Tys.push_back(toVectorizedTy(ScalarTy, VF));
4917
4918 // An in-loop reduction using an fmuladd intrinsic is a special case;
4919 // we don't want the normal cost for that intrinsic.
4921 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
4924 *RedCost);
4925 continue;
4926 }
4927
4928 // Find the cost of vectorizing the call, if we can find a suitable
4929 // vector variant of the function.
4930 VFInfo FuncInfo;
4931 Function *VecFunc = nullptr;
4932 // Search through any available variants for one we can use at this VF.
4933 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
4934 // Must match requested VF.
4935 if (Info.Shape.VF != VF)
4936 continue;
4937
4938 // Must take a mask argument if one is required
4939 if (MaskRequired && !Info.isMasked())
4940 continue;
4941
4942 // Check that all parameter kinds are supported
4943 bool ParamsOk = true;
4944 for (VFParameter Param : Info.Shape.Parameters) {
4945 switch (Param.ParamKind) {
4947 break;
4949 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
4950 // Make sure the scalar parameter in the loop is invariant.
4951 if (!PSE.getSE()->isSCEVable(ScalarParam->getType()) ||
4952 !PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
4953 TheLoop))
4954 ParamsOk = false;
4955 break;
4956 }
4958 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
4959 // Find the stride for the scalar parameter in this loop and see if
4960 // it matches the stride for the variant.
4961 // TODO: do we need to figure out the cost of an extract to get the
4962 // first lane? Or do we hope that it will be folded away?
4963 ScalarEvolution *SE = PSE.getSE();
4964 if (!SE->isSCEVable(ScalarParam->getType()) ||
4965 !match(SE->getSCEV(ScalarParam),
4967 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
4969 ParamsOk = false;
4970 break;
4971 }
4973 break;
4974 default:
4975 ParamsOk = false;
4976 break;
4977 }
4978 }
4979
4980 if (!ParamsOk)
4981 continue;
4982
4983 // Found a suitable candidate, stop here.
4984 VecFunc = CI->getModule()->getFunction(Info.VectorName);
4985 FuncInfo = Info;
4986 break;
4987 }
4988
// A vector-variant cost is only computed when TLI is available, a variant was
// found in the module and the call is not marked nobuiltin.
4989 if (TLI && VecFunc && !CI->isNoBuiltin())
4990 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, Config.CostKind);
4991
4992 // Find the cost of an intrinsic; some targets may have instructions that
4993 // perform the operation without needing an actual call.
4995 if (IID != Intrinsic::not_intrinsic)
4997
// Start from scalarization and switch to a cheaper-or-equal alternative; the
// <= comparisons mean a later option is chosen on a tie.
4998 InstructionCost Cost = ScalarCost;
4999 InstWidening Decision = CM_Scalarize;
5000
5001 if (VectorCost.isValid() && VectorCost <= Cost) {
5002 Cost = VectorCost;
5003 Decision = CM_VectorCall;
5004 }
5005
5006 if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
5008 Decision = CM_IntrinsicCall;
5009 }
5010
5011 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, Cost);
5012 }
5013 }
5014}
5015
// Returns true if Op should be treated as loop-invariant. Op must first be
// invariant according to Legal. It then qualifies if it is not an instruction,
// or is defined outside TheLoop, or is an in-loop instruction that is not
// predicated, is not a PHI in the loop header, and whose operands all
// (recursively) qualify as well.
// NOTE(review): the embedded listing numbers skip 5016, so the signature line
// is not visible in this extract; the name shouldConsiderInvariant is taken
// from the recursive call below -- confirm the full signature against the
// upstream file.
5017 if (!Legal->isInvariant(Op))
5018 return false;
5019 // Consider Op invariant, if it or its operands aren't predicated
5020 // instruction in the loop. In that case, it is not trivially hoistable.
5021 auto *OpI = dyn_cast<Instruction>(Op);
5022 return !OpI || !TheLoop->contains(OpI) ||
5023 (!isPredicatedInst(OpI) &&
5024 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
5025 all_of(OpI->operands(),
5026 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
5027}
5028
5031 ElementCount VF) {
5032 // If we know that this instruction will remain uniform, check the cost of
5033 // the scalar version.
5035 VF = ElementCount::getFixed(1);
5036
5037 if (VF.isVector() && isProfitableToScalarize(I, VF))
5038 return InstsToScalarize[VF][I];
5039
5040 // Forced scalars do not have any scalarization overhead.
5041 auto ForcedScalar = ForcedScalars.find(VF);
5042 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
5043 auto InstSet = ForcedScalar->second;
5044 if (InstSet.count(I))
5046 VF.getKnownMinValue();
5047 }
5048
5049 const auto &MinBWs = Config.getMinimalBitwidths();
5050 uint64_t InstrMinBWs = MinBWs.lookup(I);
5051 Type *RetTy = I->getType();
5053 RetTy = IntegerType::get(RetTy->getContext(), InstrMinBWs);
5054 auto *SE = PSE.getSE();
5055
5056 Type *VectorTy;
5057 if (isScalarAfterVectorization(I, VF)) {
5058 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
5059 [this](Instruction *I, ElementCount VF) -> bool {
5060 if (VF.isScalar())
5061 return true;
5062
5063 auto Scalarized = InstsToScalarize.find(VF);
5064 assert(Scalarized != InstsToScalarize.end() &&
5065 "VF not yet analyzed for scalarization profitability");
5066 return !Scalarized->second.count(I) &&
5067 llvm::all_of(I->users(), [&](User *U) {
5068 auto *UI = cast<Instruction>(U);
5069 return !Scalarized->second.count(UI);
5070 });
5071 };
5072
5073 // With the exception of GEPs and PHIs, after scalarization there should
5074 // only be one copy of the instruction generated in the loop. This is
5075 // because the VF is either 1, or any instructions that need scalarizing
5076 // have already been dealt with by the time we get here. As a result,
5077 // it means we don't have to multiply the instruction cost by VF.
5078 assert(I->getOpcode() == Instruction::GetElementPtr ||
5079 I->getOpcode() == Instruction::PHI ||
5080 (I->getOpcode() == Instruction::BitCast &&
5081 I->getType()->isPointerTy()) ||
5082 HasSingleCopyAfterVectorization(I, VF));
5083 VectorTy = RetTy;
5084 } else
5085 VectorTy = toVectorizedTy(RetTy, VF);
5086
5087 if (VF.isVector() && VectorTy->isVectorTy() &&
5088 !TTI.getNumberOfParts(VectorTy))
5090
5091 // TODO: We need to estimate the cost of intrinsic calls.
5092 switch (I->getOpcode()) {
5093 case Instruction::GetElementPtr:
5094 // We mark this instruction as zero-cost because the cost of GEPs in
5095 // vectorized code depends on whether the corresponding memory instruction
5096 // is scalarized or not. Therefore, we handle GEPs with the memory
5097 // instruction cost.
5098 return 0;
5099 case Instruction::UncondBr:
5100 case Instruction::CondBr: {
5101 // In cases of scalarized and predicated instructions, there will be VF
5102 // predicated blocks in the vectorized loop. Each branch around these
5103 // blocks requires also an extract of its vector compare i1 element.
5104 // Note that the conditional branch from the loop latch will be replaced by
5105 // a single branch controlling the loop, so there is no extra overhead from
5106 // scalarization.
5107 bool ScalarPredicatedBB = false;
5109 if (VF.isVector() && BI &&
5110 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
5111 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
5112 BI->getParent() != TheLoop->getLoopLatch())
5113 ScalarPredicatedBB = true;
5114
5115 if (ScalarPredicatedBB) {
5116 // Not possible to scalarize scalable vector with predicated instructions.
5117 if (VF.isScalable())
5119 // Return cost for branches around scalarized and predicated blocks.
5120 auto *VecI1Ty =
5122 return (TTI.getScalarizationOverhead(
5123 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5124 /*Insert*/ false, /*Extract*/ true, Config.CostKind) +
5125 (TTI.getCFInstrCost(Instruction::CondBr, Config.CostKind) *
5126 VF.getFixedValue()));
5127 }
5128
5129 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
5130 // The back-edge branch will remain, as will all scalar branches.
5131 return TTI.getCFInstrCost(Instruction::UncondBr, Config.CostKind);
5132
5133 // This branch will be eliminated by if-conversion.
5134 return 0;
5135 // Note: We currently assume zero cost for an unconditional branch inside
5136 // a predicated block since it will become a fall-through, although we
5137 // may decide in the future to call TTI for all branches.
5138 }
5139 case Instruction::Switch: {
5140 if (VF.isScalar())
5141 return TTI.getCFInstrCost(Instruction::Switch, Config.CostKind);
5142 auto *Switch = cast<SwitchInst>(I);
5143 return Switch->getNumCases() *
5144 TTI.getCmpSelInstrCost(
5145 Instruction::ICmp,
5146 toVectorTy(Switch->getCondition()->getType(), VF),
5147 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
5148 CmpInst::ICMP_EQ, Config.CostKind);
5149 }
5150 case Instruction::PHI: {
5151 auto *Phi = cast<PHINode>(I);
5152
5153 // First-order recurrences are replaced by vector shuffles inside the loop.
5154 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
5155 return TTI.getShuffleCost(
5157 cast<VectorType>(VectorTy), {}, Config.CostKind, -1);
5158 }
5159
5160 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
5161 // converted into select instructions. We require N - 1 selects per phi
5162 // node, where N is the number of incoming values.
5163 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
5164 Type *ResultTy = Phi->getType();
5165
5166 // All instructions in an Any-of reduction chain are narrowed to bool.
5167 // Check if that is the case for this phi node.
5168 auto *HeaderUser = cast_if_present<PHINode>(
5169 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
5170 auto *Phi = dyn_cast<PHINode>(U);
5171 if (Phi && Phi->getParent() == TheLoop->getHeader())
5172 return Phi;
5173 return nullptr;
5174 }));
5175 if (HeaderUser) {
5176 auto &ReductionVars = Legal->getReductionVars();
5177 auto Iter = ReductionVars.find(HeaderUser);
5178 if (Iter != ReductionVars.end() &&
5180 Iter->second.getRecurrenceKind()))
5181 ResultTy = Type::getInt1Ty(Phi->getContext());
5182 }
5183 return (Phi->getNumIncomingValues() - 1) *
5184 TTI.getCmpSelInstrCost(
5185 Instruction::Select, toVectorTy(ResultTy, VF),
5186 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
5187 CmpInst::BAD_ICMP_PREDICATE, Config.CostKind);
5188 }
5189
5190 // When tail folding with EVL, if the phi is part of an out of loop
5191 // reduction then it will be transformed into a wide vp_merge.
5192 if (VF.isVector() && foldTailWithEVL() &&
5193 Legal->getReductionVars().contains(Phi) &&
5194 !Config.isInLoopReduction(Phi)) {
5196 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
5197 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
5198 return TTI.getIntrinsicInstrCost(ICA, Config.CostKind);
5199 }
5200
5201 return TTI.getCFInstrCost(Instruction::PHI, Config.CostKind);
5202 }
5203 case Instruction::UDiv:
5204 case Instruction::SDiv:
5205 case Instruction::URem:
5206 case Instruction::SRem:
5207 if (VF.isVector() && isPredicatedInst(I)) {
5208 const auto [ScalarCost, MaskedCost] = getDivRemSpeculationCost(I, VF);
5209 return isDivRemScalarWithPredication(ScalarCost, MaskedCost) ? ScalarCost
5210 : MaskedCost;
5211 }
5212 // We've proven all lanes safe to speculate, fall through.
5213 [[fallthrough]];
5214 case Instruction::Add:
5215 case Instruction::Sub: {
5216 auto Info = Legal->getHistogramInfo(I);
5217 if (Info && VF.isVector()) {
5218 const HistogramInfo *HGram = Info.value();
5219 // Assume that a non-constant update value (or a constant != 1) requires
5220 // a multiply, and add that into the cost.
5222 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
5223 if (!RHS || RHS->getZExtValue() != 1)
5224 MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
5225 Config.CostKind);
5226
5227 // Find the cost of the histogram operation itself.
5228 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
5229 Type *ScalarTy = I->getType();
5230 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
5231 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
5232 Type::getVoidTy(I->getContext()),
5233 {PtrTy, ScalarTy, MaskTy});
5234
5235 // Add the costs together with the add/sub operation.
5236 return TTI.getIntrinsicInstrCost(ICA, Config.CostKind) + MulCost +
5237 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy,
5238 Config.CostKind);
5239 }
5240 [[fallthrough]];
5241 }
5242 case Instruction::FAdd:
5243 case Instruction::FSub:
5244 case Instruction::Mul:
5245 case Instruction::FMul:
5246 case Instruction::FDiv:
5247 case Instruction::FRem:
5248 case Instruction::Shl:
5249 case Instruction::LShr:
5250 case Instruction::AShr:
5251 case Instruction::And:
5252 case Instruction::Or:
5253 case Instruction::Xor: {
5254 // If we're speculating on the stride being 1, the multiplication may
5255 // fold away. We can generalize this for all operations using the notion
5256 // of neutral elements. (TODO)
5257 if (I->getOpcode() == Instruction::Mul &&
5258 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
5259 PSE.getSCEV(I->getOperand(0))->isOne()) ||
5260 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
5261 PSE.getSCEV(I->getOperand(1))->isOne())))
5262 return 0;
5263
5264 // Detect reduction patterns
5265 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
5266 return *RedCost;
5267
5268 // Certain instructions can be cheaper to vectorize if they have a constant
5269 // second vector operand. One example of this are shifts on x86.
5270 Value *Op2 = I->getOperand(1);
5271 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
5272 PSE.getSE()->isSCEVable(Op2->getType()) &&
5273 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
5274 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
5275 }
5276 auto Op2Info = TTI.getOperandInfo(Op2);
5277 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
5280
5281 SmallVector<const Value *, 4> Operands(I->operand_values());
5282 return TTI.getArithmeticInstrCost(
5283 I->getOpcode(), VectorTy, Config.CostKind,
5284 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5285 Op2Info, Operands, I, TLI);
5286 }
5287 case Instruction::FNeg: {
5288 return TTI.getArithmeticInstrCost(
5289 I->getOpcode(), VectorTy, Config.CostKind,
5290 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5291 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5292 I->getOperand(0), I);
5293 }
5294 case Instruction::Select: {
5296 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
5297 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5298
5299 const Value *Op0, *Op1;
5300 using namespace llvm::PatternMatch;
5301 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
5302 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
5303 // select x, y, false --> x & y
5304 // select x, true, y --> x | y
5305 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
5306 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
5307 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
5308 Op1->getType()->getScalarSizeInBits() == 1);
5309
5310 return TTI.getArithmeticInstrCost(
5311 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
5312 VectorTy, Config.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1},
5313 I);
5314 }
5315
5316 Type *CondTy = SI->getCondition()->getType();
5317 if (!ScalarCond)
5318 CondTy = VectorType::get(CondTy, VF);
5319
5321 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
5322 Pred = Cmp->getPredicate();
5323 return TTI.getCmpSelInstrCost(
5324 I->getOpcode(), VectorTy, CondTy, Pred, Config.CostKind,
5325 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
5326 }
5327 case Instruction::ICmp:
5328 case Instruction::FCmp: {
5329 Type *ValTy = I->getOperand(0)->getType();
5330
5332 [[maybe_unused]] Instruction *Op0AsInstruction =
5333 dyn_cast<Instruction>(I->getOperand(0));
5334 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
5335 InstrMinBWs == MinBWs.lookup(Op0AsInstruction)) &&
5336 "if both the operand and the compare are marked for "
5337 "truncation, they must have the same bitwidth");
5338 ValTy = IntegerType::get(ValTy->getContext(), InstrMinBWs);
5339 }
5340
5341 VectorTy = toVectorTy(ValTy, VF);
5342 return TTI.getCmpSelInstrCost(
5343 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
5344 cast<CmpInst>(I)->getPredicate(), Config.CostKind,
5345 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
5346 }
5347 case Instruction::Store:
5348 case Instruction::Load: {
5349 ElementCount Width = VF;
5350 if (Width.isVector()) {
5351 InstWidening Decision = getWideningDecision(I, Width);
5352 assert(Decision != CM_Unknown &&
5353 "CM decision should be taken at this point");
5356 if (Decision == CM_Scalarize)
5357 Width = ElementCount::getFixed(1);
5358 }
5359 VectorTy = toVectorTy(getLoadStoreType(I), Width);
5360 return getMemoryInstructionCost(I, VF);
5361 }
5362 case Instruction::BitCast:
5363 if (I->getType()->isPointerTy())
5364 return 0;
5365 [[fallthrough]];
5366 case Instruction::ZExt:
5367 case Instruction::SExt:
5368 case Instruction::FPToUI:
5369 case Instruction::FPToSI:
5370 case Instruction::FPExt:
5371 case Instruction::PtrToInt:
5372 case Instruction::IntToPtr:
5373 case Instruction::SIToFP:
5374 case Instruction::UIToFP:
5375 case Instruction::Trunc:
5376 case Instruction::FPTrunc: {
5377 // Computes the CastContextHint from a Load/Store instruction.
5378 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
5380 "Expected a load or a store!");
5381
5382 if (VF.isScalar() || !TheLoop->contains(I))
5384
5385 switch (getWideningDecision(I, VF)) {
5397 llvm_unreachable("Instr did not go through cost modelling?");
5400 llvm_unreachable_internal("Instr has invalid widening decision");
5403 }
5404
5405 llvm_unreachable("Unhandled case!");
5406 };
5407
5408 unsigned Opcode = I->getOpcode();
5410 // For Trunc, the context is the only user, which must be a StoreInst.
5411 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
5412 if (I->hasOneUse())
5413 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
5414 CCH = ComputeCCH(Store);
5415 }
5416 // For Z/Sext, the context is the operand, which must be a LoadInst.
5417 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
5418 Opcode == Instruction::FPExt) {
5419 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
5420 CCH = ComputeCCH(Load);
5421 }
5422
5423 // We optimize the truncation of induction variables having constant
5424 // integer steps. The cost of these truncations is the same as the scalar
5425 // operation.
5426 if (isOptimizableIVTruncate(I, VF)) {
5427 auto *Trunc = cast<TruncInst>(I);
5428 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
5429 Trunc->getSrcTy(), CCH, Config.CostKind,
5430 Trunc);
5431 }
5432
5433 // Detect reduction patterns
5434 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
5435 return *RedCost;
5436
5437 Type *SrcScalarTy = I->getOperand(0)->getType();
5438 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
5439 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
5440 SrcScalarTy = IntegerType::get(SrcScalarTy->getContext(),
5441 MinBWs.lookup(Op0AsInstruction));
5442 Type *SrcVecTy =
5443 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
5444
5446 // If the result type is <= the source type, there will be no extend
5447 // after truncating the users to the minimal required bitwidth.
5448 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
5449 (I->getOpcode() == Instruction::ZExt ||
5450 I->getOpcode() == Instruction::SExt))
5451 return 0;
5452 }
5453
5454 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH,
5455 Config.CostKind, I);
5456 }
5457 case Instruction::Call:
5458 return getVectorCallCost(cast<CallInst>(I), VF);
5459 case Instruction::ExtractValue:
5460 return TTI.getInstructionCost(I, Config.CostKind);
5461 case Instruction::Alloca:
5462 // We cannot easily widen alloca to a scalable alloca, as
5463 // the result would need to be a vector of pointers.
5464 if (VF.isScalable())
5466 return TTI.getArithmeticInstrCost(Instruction::Mul, RetTy, Config.CostKind);
5467 case Instruction::Freeze:
5468 return TTI::TCC_Free;
5469 default:
5470 // This opcode is unknown. Assume that it is the same as 'mul'.
5471 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
5472 Config.CostKind);
5473 } // end of switch.
5474}
5475
5477 // Ignore ephemeral values.
5479
5480 SmallVector<Value *, 4> DeadInterleavePointerOps;
5482
5483 // If a scalar epilogue is required, users outside the loop won't use
5484 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
5485 // that is the case.
5486 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
5487 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
5488 return RequiresScalarEpilogue &&
5489 !TheLoop->contains(cast<Instruction>(U)->getParent());
5490 };
5491
5493 DFS.perform(LI);
5494 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
5495 for (Instruction &I : reverse(*BB)) {
5496 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
5497 continue;
5498
5499 // Add instructions that would be trivially dead and are only used by
5500 // values already ignored to DeadOps to seed worklist.
5502 all_of(I.users(), [this, IsLiveOutDead](User *U) {
5503 return VecValuesToIgnore.contains(U) ||
5504 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
5505 }))
5506 DeadOps.push_back(&I);
5507
5508 // For interleave groups, we only create a pointer for the start of the
5509 // interleave group. Queue up addresses of group members except the insert
5510 // position for further processing.
5511 if (isAccessInterleaved(&I)) {
5512 auto *Group = getInterleavedAccessGroup(&I);
5513 if (Group->getInsertPos() == &I)
5514 continue;
5515 Value *PointerOp = getLoadStorePointerOperand(&I);
5516 DeadInterleavePointerOps.push_back(PointerOp);
5517 }
5518
5519 // Queue branches for analysis. They are dead, if their successors only
5520 // contain dead instructions.
5521 if (isa<CondBrInst>(&I))
5522 DeadOps.push_back(&I);
5523 }
5524
5525 // Mark ops feeding interleave group members as free, if they are only used
5526 // by other dead computations.
5527 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
5528 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
5529 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
5530 Instruction *UI = cast<Instruction>(U);
5531 return !VecValuesToIgnore.contains(U) &&
5532 (!isAccessInterleaved(UI) ||
5533 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
5534 }))
5535 continue;
5536 VecValuesToIgnore.insert(Op);
5537 append_range(DeadInterleavePointerOps, Op->operands());
5538 }
5539
5540 // Mark ops that would be trivially dead and are only used by ignored
5541 // instructions as free.
5542 BasicBlock *Header = TheLoop->getHeader();
5543
5544 // Returns true if the block contains only dead instructions. Such blocks will
5545 // be removed by VPlan-to-VPlan transforms and won't be considered by the
5546 // VPlan-based cost model, so skip them in the legacy cost-model as well.
5547 auto IsEmptyBlock = [this](BasicBlock *BB) {
5548 return all_of(*BB, [this](Instruction &I) {
5549 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
5551 });
5552 };
5553 for (unsigned I = 0; I != DeadOps.size(); ++I) {
5554 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
5555
5556 // Check if the branch should be considered dead.
5557 if (auto *Br = dyn_cast_or_null<CondBrInst>(Op)) {
5558 BasicBlock *ThenBB = Br->getSuccessor(0);
5559 BasicBlock *ElseBB = Br->getSuccessor(1);
5560 // Don't consider branches leaving the loop for simplification.
5561 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
5562 continue;
5563 bool ThenEmpty = IsEmptyBlock(ThenBB);
5564 bool ElseEmpty = IsEmptyBlock(ElseBB);
5565 if ((ThenEmpty && ElseEmpty) ||
5566 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
5567 ElseBB->phis().empty()) ||
5568 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
5569 ThenBB->phis().empty())) {
5570 VecValuesToIgnore.insert(Br);
5571 DeadOps.push_back(Br->getCondition());
5572 }
5573 continue;
5574 }
5575
5576 // Skip any op that shouldn't be considered dead.
5577 if (!Op || !TheLoop->contains(Op) ||
5578 (isa<PHINode>(Op) && Op->getParent() == Header) ||
5580 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
5581 return !VecValuesToIgnore.contains(U) &&
5582 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
5583 }))
5584 continue;
5585
5586 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
5587 // which applies for both scalar and vector versions. Otherwise it is only
5588 // dead in vector versions, so only add it to VecValuesToIgnore.
5589 if (all_of(Op->users(),
5590 [this](User *U) { return ValuesToIgnore.contains(U); }))
5591 ValuesToIgnore.insert(Op);
5592
5593 VecValuesToIgnore.insert(Op);
5594 append_range(DeadOps, Op->operands());
5595 }
5596
5597 // Ignore type-promoting instructions we identified during reduction
5598 // detection.
5599 for (const auto &Reduction : Legal->getReductionVars()) {
5600 const RecurrenceDescriptor &RedDes = Reduction.second;
5601 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
5602 VecValuesToIgnore.insert_range(Casts);
5603 }
5604 // Ignore type-casting instructions we identified during induction
5605 // detection.
5606 for (const auto &Induction : Legal->getInductionVars()) {
5607 const InductionDescriptor &IndDes = Induction.second;
5608 VecValuesToIgnore.insert_range(IndDes.getCastInsts());
5609 }
5610}
5611
/// Plan vectorization of the original loop for the user-requested \p UserVF
/// and \p UserIC. Collects the values the cost model should ignore, asks the
/// cost model for the maximum feasible VFs and builds VPlans for the candidate
/// VFs. Returns early if no VF is feasible, after building the single plan
/// used for an outer loop, or once a plan for a usable UserVF has been
/// selected.
5612void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
5613 CM.collectValuesToIgnore();
5614 Config.collectElementTypesForWidening(&CM.ValuesToIgnore);
5615
5616 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
5617 if (!MaxFactors) // Cases that should be neither vectorized nor interleaved.
5618 return;
5619
5620 if (!OrigLoop->isInnermost()) {
5621 // For outer loops, computeMaxVF returns a single non-scalar VF; build a
5622 // plan for only that VF.
5623 ElementCount VF =
5624 MaxFactors.FixedVF ? MaxFactors.FixedVF : MaxFactors.ScalableVF;
5625 buildVPlans(VF, VF);
5627 return;
5628 }
5629
5630 // Compute the minimal bitwidths required for integer operations in the loop
5631 // for later use by the cost model.
5632 Config.computeMinimalBitwidths();
5633
5634 // Invalidate interleave groups if all blocks of loop will be predicated.
5635 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
5637 LLVM_DEBUG(
5638 dbgs()
5639 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
5640 "which requires masked-interleaved support.\n");
5641 if (CM.InterleaveInfo.invalidateGroups())
5642 // Invalidating interleave groups also requires invalidating all decisions
5643 // based on them, which includes widening decisions and uniform and scalar
5644 // values.
5645 CM.invalidateCostModelingDecisions();
5646 }
5647
5648 if (CM.foldTailByMasking())
5649 Legal->prepareToFoldTailByMasking();
5650
5651 ElementCount MaxUserVF =
5652 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
5653 if (UserVF) {
5654 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
5656 "UserVF ignored because it may be larger than the maximal safe VF",
5657 "InvalidUserVF", ORE, OrigLoop);
5658 } else {
5660 "VF needs to be a power of two");
5661 // Collect the instructions (and their associated costs) that will be more
5662 // profitable to scalarize.
5663 Config.collectInLoopReductions();
5664 CM.collectNonVectorizedAndSetWideningDecisions(UserVF);
5665 ElementCount EpilogueUserVF =
5667 if (EpilogueUserVF.isVector() &&
5668 ElementCount::isKnownLT(EpilogueUserVF, UserVF)) {
5669 CM.collectNonVectorizedAndSetWideningDecisions(EpilogueUserVF);
5670 buildVPlans(EpilogueUserVF, EpilogueUserVF);
5671 }
5672 buildVPlans(UserVF, UserVF);
5673 if (!VPlans.empty() && VPlans.back()->getSingleVF() == UserVF) {
5674 // For scalar VF, skip VPlan cost check as VPlan cost is designed for
5675 // vector VFs only.
5676 if (UserVF.isScalar() ||
5677 cost(*VPlans.back(), UserVF, /*RU=*/nullptr).isValid()) {
5678 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
5680 return;
5681 }
5682 }
5683 VPlans.clear();
5684 reportVectorizationInfo("UserVF ignored because of invalid costs.",
5685 "InvalidCost", ORE, OrigLoop);
5686 }
5687 }
5688
5689 // Collect the Vectorization Factor Candidates.
5690 SmallVector<ElementCount> VFCandidates;
5691 for (auto VF = ElementCount::getFixed(1);
5692 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
5693 VFCandidates.push_back(VF);
5694 for (auto VF = ElementCount::getScalable(1);
5695 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
5696 VFCandidates.push_back(VF);
5697
5698 Config.collectInLoopReductions();
5699 for (const auto &VF : VFCandidates) {
5700 // Collect Uniform and Scalar instructions after vectorization with VF.
5701 CM.collectNonVectorizedAndSetWideningDecisions(VF);
5702 }
5703
5704 buildVPlans(ElementCount::getFixed(1), MaxFactors.FixedVF);
5705 buildVPlans(ElementCount::getScalable(1), MaxFactors.ScalableVF);
5706
5708}
5709
5711 ElementCount VF) const {
5712 InstructionCost Cost = CM.getInstructionCost(UI, VF);
5713 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
5715 return Cost;
5716}
5717
5718bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
5719 return CM.ValuesToIgnore.contains(UI) ||
5720 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
5721 SkipCostComputation.contains(UI);
5722}
5723
5729
5731 return CM.getPredBlockCostDivisor(CostKind, BB);
5732}
5733
5735 return CM.isScalarWithPredication(I, VF) ||
5736 CM.isUniformAfterVectorization(I, VF) || CM.isForcedScalar(I, VF) ||
5737 (VF.isVector() && CM.isProfitableToScalarize(I, VF));
5738}
5739
5741 return CM.isMaskRequired(I);
5742}
5743
5744std::optional<VPCostContext::CallWideningKind>
5746 if (VF.isScalar())
5748 switch (CM.getCallWideningDecision(CI, VF).Kind) {
5755 default:
5756 return std::nullopt;
5757 }
5758}
5759
5761LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
5762 VPCostContext &CostCtx) const {
5764 // Cost modeling for inductions is inaccurate in the legacy cost model
5765 // compared to the recipes that are generated. To match here initially during
5766 // VPlan cost model bring up directly use the induction costs from the legacy
5767 // cost model. Note that we do this as pre-processing; the VPlan may not have
5768 // any recipes associated with the original induction increment instruction
5769 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
5770 // the cost of induction phis and increments (both that are represented by
5771 // recipes and those that are not), to avoid distinguishing between them here,
5772 // and skip all recipes that represent induction phis and increments (the
5773 // former case) later on, if they exist, to avoid counting them twice.
5774 // Similarly we pre-compute the cost of any optimized truncates.
5775 // TODO: Switch to more accurate costing based on VPlan.
5776 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
5778 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
5779 SmallVector<Instruction *> IVInsts = {IVInc};
5780 for (unsigned I = 0; I != IVInsts.size(); I++) {
5781 for (Value *Op : IVInsts[I]->operands()) {
5782 auto *OpI = dyn_cast<Instruction>(Op);
5783 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
5784 continue;
5785 IVInsts.push_back(OpI);
5786 }
5787 }
5788 IVInsts.push_back(IV);
5789 for (User *U : IV->users()) {
5790 auto *CI = cast<Instruction>(U);
5791 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
5792 continue;
5793 IVInsts.push_back(CI);
5794 }
5795
5796 // If the vector loop gets executed exactly once with the given VF, ignore
5797 // the costs of comparison and induction instructions, as they'll get
5798 // simplified away.
5799 // TODO: Remove this code after stepping away from the legacy cost model and
5800 // adding code to simplify VPlans before calculating their costs.
5801 auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
5802 if (TC == VF && !CM.foldTailByMasking())
5803 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
5804 CostCtx.SkipCostComputation);
5805
5806 for (Instruction *IVInst : IVInsts) {
5807 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
5808 continue;
5809 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
5810 LLVM_DEBUG({
5811 dbgs() << "Cost of " << InductionCost << " for VF " << VF
5812 << ": induction instruction " << *IVInst << "\n";
5813 });
5814 Cost += InductionCost;
5815 CostCtx.SkipCostComputation.insert(IVInst);
5816 }
5817 }
5818
5819 /// Compute the cost of all exiting conditions of the loop using the legacy
5820 /// cost model. This is to match the legacy behavior, which adds the cost of
5821 /// all exit conditions. Note that this over-estimates the cost, as there will
5822 /// be a single condition to control the vector loop.
5824 CM.TheLoop->getExitingBlocks(Exiting);
5825 SetVector<Instruction *> ExitInstrs;
5826 // Collect all exit conditions.
5827 for (BasicBlock *EB : Exiting) {
5828 auto *Term = dyn_cast<CondBrInst>(EB->getTerminator());
5829 if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
5830 continue;
5831 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
5832 ExitInstrs.insert(CondI);
5833 }
5834 }
5835 // Compute the cost of all instructions only feeding the exit conditions.
5836 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
5837 Instruction *CondI = ExitInstrs[I];
5838 if (!OrigLoop->contains(CondI) ||
5839 !CostCtx.SkipCostComputation.insert(CondI).second)
5840 continue;
5841 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
5842 LLVM_DEBUG({
5843 dbgs() << "Cost of " << CondICost << " for VF " << VF
5844 << ": exit condition instruction " << *CondI << "\n";
5845 });
5846 Cost += CondICost;
5847 for (Value *Op : CondI->operands()) {
5848 auto *OpI = dyn_cast<Instruction>(Op);
5849 if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
5850 any_of(OpI->users(), [&ExitInstrs](User *U) {
5851 return !ExitInstrs.contains(cast<Instruction>(U));
5852 }))
5853 continue;
5854 ExitInstrs.insert(OpI);
5855 }
5856 }
5857
5858 // Pre-compute the costs for branches except for the backedge, as the number
5859 // of replicate regions in a VPlan may not directly match the number of
5860 // branches, which would lead to different decisions.
5861 // TODO: Compute cost of branches for each replicate region in the VPlan,
5862 // which is more accurate than the legacy cost model.
5863 for (BasicBlock *BB : OrigLoop->blocks()) {
5864 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
5865 continue;
5866 CostCtx.SkipCostComputation.insert(BB->getTerminator());
5867 if (BB == OrigLoop->getLoopLatch())
5868 continue;
5869 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
5870 Cost += BranchCost;
5871 }
5872
5873 // Don't apply special costs when instruction cost is forced to make sure the
5874 // forced cost is used for each recipe.
5875 if (ForceTargetInstructionCost.getNumOccurrences())
5876 return Cost;
5877
5878 // Pre-compute costs for instructions that are forced-scalar or profitable to
5879 // scalarize. For most such instructions, their scalarization costs are
5880 // accounted for here using the legacy cost model. However, some opcodes
5881 // are excluded from these precomputed scalarization costs and are instead
5882 // modeled later by the VPlan cost model (see UseVPlanCostModel below).
5883 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
5884 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
5885 continue;
5886 CostCtx.SkipCostComputation.insert(ForcedScalar);
5887 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
5888 LLVM_DEBUG({
5889 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
5890 << ": forced scalar " << *ForcedScalar << "\n";
5891 });
5892 Cost += ForcedCost;
5893 }
5894
5895 auto UseVPlanCostModel = [](Instruction *I) -> bool {
5896 switch (I->getOpcode()) {
5897 case Instruction::SDiv:
5898 case Instruction::UDiv:
5899 case Instruction::SRem:
5900 case Instruction::URem:
5901 return true;
5902 default:
5903 return false;
5904 }
5905 };
5906 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
5907 if (UseVPlanCostModel(Scalarized) ||
5908 CostCtx.skipCostComputation(Scalarized, VF.isVector()))
5909 continue;
5910 CostCtx.SkipCostComputation.insert(Scalarized);
5911 LLVM_DEBUG({
5912 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
5913 << ": profitable to scalarize " << *Scalarized << "\n";
5914 });
5915 Cost += ScalarCost;
5916 }
5917
5918 return Cost;
5919}
5920
5921InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
5922 VPRegisterUsage *RU) const {
5923 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, Config.CostKind, PSE,
5924 OrigLoop);
5925 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
5926
5927 // Now compute and add the VPlan-based cost.
5928 Cost += Plan.cost(VF, CostCtx);
5929
5930 // Add the cost of spills due to excess register usage
5931 if (RU && Config.shouldConsiderRegPressureForVF(VF))
5932 Cost += RU->spillCost(CM.TTI, Config.CostKind, ForceTargetNumVectorRegs);
5933
5934#ifndef NDEBUG
5935 unsigned EstimatedWidth =
5936 estimateElementCount(VF, Config.getVScaleForTuning());
5937 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
5938 << " (Estimated cost per lane: ");
5939 if (Cost.isValid()) {
5940 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
5941 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
5942 } else /* No point dividing an invalid cost - it will still be invalid */
5943 LLVM_DEBUG(dbgs() << "Invalid");
5944 LLVM_DEBUG(dbgs() << ")\n");
5945#endif
5946 return Cost;
5947}
5948
5949std::pair<VectorizationFactor, VPlan *>
5951 if (VPlans.empty())
5952 return {VectorizationFactor::Disabled(), nullptr};
5953 // If there is a single VPlan with a single VF, return it directly.
5954 VPlan &FirstPlan = *VPlans[0];
5955
5956 ElementCount UserVF = Hints.getWidth();
5957 if (VPlans.size() == 1) {
5958 // For outer loops, the plan has a single vector VF determined by the
5959 // heuristic.
5960 assert((FirstPlan.hasScalarVFOnly() || hasPlanWithVF(UserVF) ||
5961 FirstPlan.isOuterLoop()) &&
5962 "must have a single scalar VF, UserVF or an outer loop");
5963 return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan};
5964 }
5965
5966 if (hasPlanWithVF(UserVF) && EpilogueVectorizationForceVF > 1) {
5967 assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built");
5968 assert(VPlans[0]->getSingleVF() ==
5970 "expected first plan to be for the forced epilogue VF");
5971 assert(VPlans[1]->getSingleVF() == UserVF &&
5972 "expected second plan to be for the forced UserVF");
5973 return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()};
5974 }
5975
5976 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
5977 << (Config.CostKind == TTI::TCK_RecipThroughput
5978 ? "Reciprocal Throughput\n"
5979 : Config.CostKind == TTI::TCK_Latency
5980 ? "Instruction Latency\n"
5981 : Config.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
5982 : Config.CostKind == TTI::TCK_SizeAndLatency
5983 ? "Code Size and Latency\n"
5984 : "Unknown\n"));
5985
5987 assert(FirstPlan.hasVF(ScalarVF) &&
5988 "More than a single plan/VF w/o any plan having scalar VF");
5989
5990 // TODO: Compute scalar cost using VPlan-based cost model.
5991 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
5992 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
5993 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
5994 VectorizationFactor BestFactor = ScalarFactor;
5995
5996 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5997 if (ForceVectorization) {
5998 // Ignore scalar width, because the user explicitly wants vectorization.
5999 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
6000 // evaluation.
6001 BestFactor.Cost = InstructionCost::getMax();
6002 }
6003
6004 VPlan *PlanForBestVF = &FirstPlan;
6005
6006 for (auto &P : VPlans) {
6007 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
6008 P->vectorFactors().end());
6009
6011 bool ConsiderRegPressure = any_of(VFs, [this](ElementCount VF) {
6012 return Config.shouldConsiderRegPressureForVF(VF);
6013 });
6015 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
6016
6017 for (unsigned I = 0; I < VFs.size(); I++) {
6018 ElementCount VF = VFs[I];
6019 if (VF.isScalar())
6020 continue;
6021 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
6022 LLVM_DEBUG(
6023 dbgs()
6024 << "LV: Not considering vector loop of width " << VF
6025 << " because it will not generate any vector instructions.\n");
6026 continue;
6027 }
6028 if (Config.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
6029 LLVM_DEBUG(
6030 dbgs()
6031 << "LV: Not considering vector loop of width " << VF
6032 << " because it would cause replicated blocks to be generated,"
6033 << " which isn't allowed when optimizing for size.\n");
6034 continue;
6035 }
6036
6038 cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr);
6039 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
6040
6041 if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) {
6042 BestFactor = CurrentFactor;
6043 PlanForBestVF = P.get();
6044 }
6045
6046 // If profitable add it to ProfitableVF list.
6047 if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
6048 ProfitableVFs.push_back(CurrentFactor);
6049 }
6050 }
6051
6052 VPlan &BestPlan = *PlanForBestVF;
6053
6054 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
6055 "when vectorizing, the scalar cost must be computed.");
6056
6057 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
6058 return {BestFactor, &BestPlan};
6059}
6060
6062 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
6064 EpilogueVectorizationKind EpilogueVecKind) {
// Finalizes and executes BestVPlan for the selected BestVF/BestUF: applies the
// remaining VPlan transforms, creates the IR skeleton through ILV, executes
// the plan, fixes up the vectorized loop and returns the SCEV expansions
// produced by VPlanTransforms::expandSCEVs.
// NOTE(review): several statements in this listing are missing their leading
// line(s); the comments below only describe what is visible.

// The plan must have been built for exactly this VF and UF.
6065 assert(BestVPlan.hasVF(BestVF) &&
6066 "Trying to execute plan with unsupported VF");
6067 assert(BestVPlan.hasUF(BestUF) &&
6068 "Trying to execute plan with unsupported UF");
// Count plans with an early exit that reach execution.
6069 if (BestVPlan.hasEarlyExit())
6070 ++LoopsEarlyExitVectorized;
6071
6073 BestVPlan, *PSE.getSE(), CM.TTI, Config.CostKind, BestVF, BestUF,
6074 CM.ValuesToIgnore);
6075 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
6076 // cost model is complete for better cost estimates.
6077 RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
// The tuning vscale is only queried when the original latch terminator carries
// branch-weight metadata.
6081 bool HasBranchWeights =
6082 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
6083 if (HasBranchWeights) {
6084 std::optional<unsigned> VScale = Config.getVScaleForTuning();
6086 BestVPlan, BestVF, VScale);
6087 }
6088
6089 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
6090 VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
6091
6093 BestVF, BestUF, PSE);
6094 RUN_VPLAN_PASS(VPlanTransforms::optimizeForVFAndUF, BestVPlan, BestVF, BestUF,
6095 PSE);
6097 if (EpilogueVecKind == EpilogueVectorizationKind::None)
6099 /*OnlyLatches=*/false);
// If the plan entry now branches straight to the scalar preheader, emit an
// analysis remark saying the vector loop never executes.
6100 if (BestVPlan.getEntry()->getSingleSuccessor() ==
6101 BestVPlan.getScalarPreheader()) {
6102 // TODO: The vector loop would be dead, should not even try to vectorize.
6103 ORE->emit([&]() {
6104 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
6105 OrigLoop->getStartLoc(),
6106 OrigLoop->getHeader())
6107 << "Created vector loop never executes due to insufficient trip "
6108 "count.";
6109 });
6111 }
6112
6114
6116 // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
6118 // Regions are dissolved after optimizing for VF and UF, which completely
6119 // removes unneeded loop regions first.
6121 // Expand BranchOnTwoConds after dissolution, when latch has direct access to
6122 // its successors.
6124 // Convert loops with variable-length stepping after regions are dissolved.
6126 // Remove dead back-edges for single-iteration loops with BranchOnCond(true).
6127 // Only process loop latches to avoid removing edges from the middle block,
6128 // which may be needed for epilogue vectorization.
6129 VPlanTransforms::removeBranchOnConst(BestVPlan, /*OnlyLatches=*/true);
// When a maximum vscale is available, bound the runtime step by
// MaxVScale * (known minimum of BestVF) * BestUF; otherwise leave it unset.
6131 std::optional<uint64_t> MaxRuntimeStep;
6132 if (auto MaxVScale = getMaxVScale(*CM.TheFunction, CM.TTI))
6133 MaxRuntimeStep = uint64_t(*MaxVScale) * BestVF.getKnownMinValue() * BestUF;
6135 BestVPlan, VectorPH, CM.foldTailByMasking(),
6136 CM.requiresScalarEpilogue(BestVF.isVector()), &BestVPlan.getVFxUF(),
6137 MaxRuntimeStep);
6138 VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
6139 VPlanTransforms::cse(BestVPlan);
6141 VPlanTransforms::simplifyKnownEVL(BestVPlan, BestVF, PSE);
6142
6143 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
6144 // making any changes to the CFG.
6145 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
6146 VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
6147
6148 // Perform the actual loop transformation.
6149 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
6150 OrigLoop->getParentLoop(),
6151 Legal->getWidestInductionType());
6152
6153#ifdef EXPENSIVE_CHECKS
6154 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
6155#endif
6156
6157 // 1. Set up the skeleton for vectorization, including vector pre-header and
6158 // middle block. The vector loop is created during VPlan execution.
6159 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
// If the plan still has a scalar preheader, bind it to the IR block that is
// the single successor of the block returned by the skeleton creation.
6160 if (VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader())
6161 replaceVPBBWithIRVPBB(ScalarPH, State.CFG.PrevBB->getSingleSuccessor(),
6162 &BestVPlan);
6164
6165 assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");
6166
6167 // After vectorization, the exit blocks of the original loop will have
6168 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
6169 // looked through single-entry phis.
6170 ScalarEvolution &SE = *PSE.getSE();
6171 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
// Exit blocks without predecessors in the plan are skipped.
6172 if (!Exit->hasPredecessors())
6173 continue;
6174 for (VPRecipeBase &PhiR : Exit->phis())
6176 &cast<VPIRPhi>(PhiR).getIRPhi());
6177 }
6178 // Forget the original loop and block dispositions.
6179 SE.forgetLoop(OrigLoop);
6181
6183
6184 //===------------------------------------------------===//
6185 //
6186 // Notice: any optimization or new instruction that go
6187 // into the code below should also be implemented in
6188 // the cost-model.
6189 //
6190 //===------------------------------------------------===//
6191
6192 // Retrieve loop information before executing the plan, which may remove the
6193 // original loop, if it becomes unreachable.
6194 MDNode *LID = OrigLoop->getLoopID();
6195 unsigned OrigLoopInvocationWeight = 0;
6196 std::optional<unsigned> OrigAverageTripCount =
6197 getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
6198
6199 BestVPlan.execute(&State);
6200
6201 // 2.6. Maintain Loop Hints
6202 // Keep all loop hints from the original loop on the vector loop (we'll
6203 // replace the vectorizer-specific hints below).
6204 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
6205 // Add metadata to disable runtime unrolling a scalar loop when there
6206 // are no runtime checks about strides and memory. A scalar loop that is
6207 // rarely used is not worth unrolling.
6208 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
// HeaderVPBB may be null; in that case a null loop is passed below.
6210 HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
6211 : nullptr,
6212 HeaderVPBB, BestVPlan,
6213 EpilogueVecKind == EpilogueVectorizationKind::Epilogue, LID,
6214 OrigAverageTripCount, OrigLoopInvocationWeight,
6215 estimateElementCount(BestVF * BestUF, Config.getVScaleForTuning()),
6216 DisableRuntimeUnroll);
6217
6218 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6219 // predication, updating analyses.
6220 ILV.fixVectorizedLoop(State);
6221
6223
// Hand the SCEV expansions created in step 0 back to the caller.
6224 return ExpandedSCEVs;
6225}
6226
6227//===--------------------------------------------------------------------===//
6228// EpilogueVectorizerMainLoop
6229//===--------------------------------------------------------------------===//
6230
// Debug-only trace announcing first-pass skeleton creation for epilogue
// vectorization; prints the main-loop and epilogue VF/UF stored in EPI.
6232 LLVM_DEBUG({
6233 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
6234 << "Main Loop VF:" << EPI.MainLoopVF
6235 << ", Main Loop UF:" << EPI.MainLoopUF
6236 << ", Epilogue Loop VF:" << EPI.EpilogueVF
6237 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
6238 });
6239}
6240
// Dump the function that contains the original loop's header, labelled as the
// intermediate result.
6243 dbgs() << "intermediate fn:\n"
6244 << *OrigLoop->getHeader()->getParent() << "\n";
6245 });
6246}
6247
6248//===--------------------------------------------------------------------===//
6249// EpilogueVectorizerEpilogueLoop
6250//===--------------------------------------------------------------------===//
6251
6252/// This function creates a new scalar preheader, using the previous one as
6253/// entry block to the epilogue VPlan. The minimum iteration check is being
6254/// represented in VPlan.
///
/// The previous scalar preheader is renamed to "vec.epilog.iter.check", wrapped
/// in a new VPIRBasicBlock that becomes the plan's entry, and returned.
// Create the new scalar preheader; its single predecessor is the block that
// was the scalar preheader before.
6256 BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
6257 BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
6258 OriginalScalarPH->setName("vec.epilog.iter.check");
6259 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
6260 VPBasicBlock *OldEntry = Plan.getEntry();
// Move every movable recipe from the old entry to the end of the new entry.
// The early-inc range keeps iteration valid while recipes are moved away.
6261 for (auto &R : make_early_inc_range(*OldEntry)) {
6262 // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
6263 // definition.
6264 if (isa<VPIRInstruction>(&R))
6265 continue;
6266 R.moveBefore(*NewEntry, NewEntry->end());
6267 }
6268
// Transfer OldEntry's block connections to NewEntry and make it the entry.
6269 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
6270 Plan.setEntry(NewEntry);
6271 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
6272
6273 return OriginalScalarPH;
6274}
6275
// Debug-only trace announcing second-pass skeleton creation for epilogue
// vectorization; prints the epilogue VF/UF stored in EPI.
6277 LLVM_DEBUG({
6278 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
6279 << "Epilogue Loop VF:" << EPI.EpilogueVF
6280 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
6281 });
6282}
6283
// Dump the function that contains the original loop's header, labelled as the
// final result.
6286 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
6287 });
6288}
6289
6291 VFRange &Range) {
// Builds the widened memory recipe for the load or store VPI. For consecutive
// accesses a vector-pointer recipe is inserted before VPI and used as the
// address; for reversed accesses the mask, the loaded value and the stored
// value are additionally wrapped in VPInstruction::Reverse. Returns a
// VPWidenLoadRecipe (or the Reverse instruction using it) for loads and a
// VPWidenStoreRecipe for stores.
// NOTE(review): the definitions of I, Decision, Reverse and Flags, and the
// condition guarding the early `return nullptr`, are not visible in this
// listing.
6292 assert((VPI->getOpcode() == Instruction::Load ||
6293 VPI->getOpcode() == Instruction::Store) &&
6294 "Must be called with either a load or store");
6296
// Per-VF predicate: false when the cost model treats I as scalar after
// vectorization or finds scalarization profitable for that VF.
6297 auto WillWiden = [&](ElementCount VF) -> bool {
6299 CM.getWideningDecision(I, VF);
6301 "CM decision should be taken at this point.");
6303 return true;
6304 if (CM.isScalarAfterVectorization(I, VF) ||
6305 CM.isProfitableToScalarize(I, VF))
6306 return false;
6308 };
6309
6311 return nullptr;
6312
6313 // If a mask is not required, drop it - use unmasked version for safe loads.
6314 // TODO: Determine if mask is needed in VPlan.
6315 VPValue *Mask = CM.isMaskRequired(I) ? VPI->getMask() : nullptr;
6316
6317 // Determine if the pointer operand of the access is either consecutive or
6318 // reverse consecutive.
6320 CM.getWideningDecision(I, Range.Start);
6322 bool Consecutive =
6324
// The address is operand 0 of a load and operand 1 of a store.
6325 VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0)
6326 : VPI->getOperand(1);
6327 if (Consecutive) {
6329 VPSingleDefRecipe *VectorPtr;
6330 if (Reverse) {
6331 // When folding the tail, we may compute an address that we don't in the
6332 // original scalar loop: drop the GEP no-wrap flags in this case.
6333 // Otherwise preserve existing flags without no-unsigned-wrap, as we will
6334 // emit negative indices.
6335 GEPNoWrapFlags ReverseFlags = CM.foldTailByMasking()
6337 : Flags.withoutNoUnsignedWrap();
6338 VectorPtr = new VPVectorEndPointerRecipe(
6339 Ptr, &Plan.getVF(), getLoadStoreType(I),
6340 /*Stride*/ -1, ReverseFlags, VPI->getDebugLoc());
6341 } else {
// Forward consecutive access: stride of one, typed as the index type of
// the pointer's underlying value.
6342 const DataLayout &DL = I->getDataLayout();
6343 auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
6344 VPValue *StrideOne = Plan.getConstantInt(StrideTy, 1);
6345 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
6346 Flags, VPI->getDebugLoc());
6347 }
6348 Builder.setInsertPoint(VPI);
6349 Builder.insert(VectorPtr);
6350 Ptr = VectorPtr;
6351 }
6352
// A reversed access needs its mask reversed as well.
6353 if (Reverse && Mask)
6354 Mask = Builder.createNaryOp(VPInstruction::Reverse, Mask, I->getDebugLoc());
6355
6356 if (VPI->getOpcode() == Instruction::Load) {
6357 auto *Load = cast<LoadInst>(I);
6358 auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, *VPI,
6359 Load->getDebugLoc());
6360 if (Reverse) {
// The load itself is inserted here; the returned Reverse instruction that
// uses it is not inserted by this code.
6361 Builder.insert(LoadR);
6362 return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
6363 LoadR->getDebugLoc());
6364 }
6365 return LoadR;
6366 }
6367
// Store: operand 0 is the stored value, reversed first for reverse accesses.
6368 StoreInst *Store = cast<StoreInst>(I);
6369 VPValue *StoredVal = VPI->getOperand(0);
6370 if (Reverse)
6371 StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
6372 Store->getDebugLoc());
6373 return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive, *VPI,
6374 Store->getDebugLoc());
6375}
6376
6378VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
6379 VFRange &Range) {
// VPI must wrap a TruncInst (enforced by the cast below). Returns nullptr when
// the range-clamped check below rejects the truncate.
// NOTE(review): the head of that check, the definition of WidenIV, the
// computation of Step and the start of the final return statement are not
// visible in this listing.
6380 auto *I = cast<TruncInst>(VPI->getUnderlyingInstr());
6381 // Optimize the special case where the source is a constant integer
6382 // induction variable. Notice that we can only optimize the 'trunc' case
6383 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6384 // (c) other casts depend on pointer size.
6385
6386 // Determine whether the truncate \p I is based on an induction variable that
6387 // can be optimized.
6390 I),
6391 Range))
6392 return nullptr;
6393
6395 VPI->getOperand(0)->getDefiningRecipe());
// Reuse the phi, start value and induction descriptor of the widened
// induction that defines the truncate's operand.
6396 PHINode *Phi = WidenIV->getPHINode();
6397 VPIRValue *Start = WidenIV->getStartValue();
6398 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
6399
6400 // Wrap flags from the original induction do not apply to the truncated type,
6401 // so do not propagate them.
6402 VPIRFlags Flags = VPIRFlags::WrapFlagsTy(false, false);
6403 VPValue *Step =
6406 Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
6407}
6408
/// Decides, for the VFs in \p Range, whether \p I should be widened rather
/// than scalarized, based on the cost-model queries in WillScalarize.
/// NOTE(review): the head of the assertion and of the final return statement
/// are not visible in this listing, so how WillScalarize is combined with
/// \p Range (and whether \p Range is clamped) cannot be confirmed here.
6409bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
6411 "Instruction should have been handled earlier");
6412 // Instruction should be widened, unless it is scalar after vectorization,
6413 // scalarization is profitable or it is predicated.
6414 auto WillScalarize = [this, I](ElementCount VF) -> bool {
6415 return CM.isScalarAfterVectorization(I, VF) ||
6416 CM.isProfitableToScalarize(I, VF) ||
6417 CM.isScalarWithPredication(I, VF);
6418 };
6420 Range);
6421}
/// Creates a widened recipe for \p VPI based on its opcode. Returns nullptr for
/// opcodes not listed in the switch. Integer division and remainder use a
/// VPWidenIntrinsicRecipe when the cost model reports the instruction as
/// predicated, otherwise they fall through to the generic VPWidenRecipe.
/// NOTE(review): the intrinsic arguments of the masked division recipe and the
/// declaration of NewOps are not visible in this listing.
6423VPRecipeWithIRFlags *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
6424 auto *I = VPI->getUnderlyingInstr();
6425 switch (VPI->getOpcode()) {
6426 default:
6427 return nullptr;
6428 case Instruction::SDiv:
6429 case Instruction::UDiv:
6430 case Instruction::SRem:
6431 case Instruction::URem:
6432 // If not provably safe, use a masked intrinsic.
6433 if (CM.isPredicatedInst(I))
6434 return new VPWidenIntrinsicRecipe(
6436 I->getType(), {}, {}, VPI->getDebugLoc());
6437 [[fallthrough]];
6438 case Instruction::Add:
6439 case Instruction::And:
6440 case Instruction::AShr:
6441 case Instruction::FAdd:
6442 case Instruction::FCmp:
6443 case Instruction::FDiv:
6444 case Instruction::FMul:
6445 case Instruction::FNeg:
6446 case Instruction::FRem:
6447 case Instruction::FSub:
6448 case Instruction::ICmp:
6449 case Instruction::LShr:
6450 case Instruction::Mul:
6451 case Instruction::Or:
6452 case Instruction::Select:
6453 case Instruction::Shl:
6454 case Instruction::Sub:
6455 case Instruction::Xor:
6456 case Instruction::Freeze:
// Generic case: widen using VPI's operands with the mask dropped.
6457 return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
6458 VPI->getDebugLoc());
6459 case Instruction::ExtractValue: {
// Only single-index extractvalue is supported; the index is appended to the
// operand list as a 32-bit constant.
6461 auto *EVI = cast<ExtractValueInst>(I);
6462 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
6463 unsigned Idx = EVI->getIndices()[0];
6464 NewOps.push_back(Plan.getConstantInt(32, Idx));
6465 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
6466 }
6467 };
6468}
6469
// Builds a VPHistogramRecipe for VPI when it is a store for which Legal has
// histogram information; returns nullptr otherwise.
// NOTE(review): the signature and the declaration of HGramOps are not visible
// in this listing.
6471 if (VPI->getOpcode() != Instruction::Store)
6472 return nullptr;
6473
6474 auto HistInfo =
6475 Legal->getHistogramInfo(cast<StoreInst>(VPI->getUnderlyingInstr()));
6476 if (!HistInfo)
6477 return nullptr;
6478
6479 const HistogramInfo *HI = *HistInfo;
6480 // FIXME: Support other operations.
6481 unsigned Opcode = HI->Update->getOpcode();
6482 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
6483 "Histogram update operation must be an Add or Sub");
6484
// Operand order of the recipe: bucket address, increment, optional mask.
6486 // Bucket address.
6487 HGramOps.push_back(VPI->getOperand(1));
6488 // Increment value.
6489 HGramOps.push_back(Plan.getOrAddLiveIn(HI->Update->getOperand(1)));
6490
6491 // In case of predicated execution (due to tail-folding, or conditional
6492 // execution, or both), pass the relevant mask.
6493 if (CM.isMaskRequired(HI->Store))
6494 HGramOps.push_back(VPI->getMask());
6495
6496 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
6497}
6498
6500 VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder) {
// Handles stores to the invariant address of a reduction. Such a store is
// always erased from the plan; only for the final invariant store is a uniform
// VPReplicateRecipe inserted through FinalRedStoresBuilder. Returns true if
// VPI was such a store (and has been erased), false otherwise.
6501 StoreInst *SI;
6502 if ((SI = dyn_cast<StoreInst>(VPI->getUnderlyingInstr())) &&
6503 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6504 // Only create recipe for the final invariant store of the reduction.
6505 if (Legal->isInvariantStoreOfReduction(SI)) {
6506 VPValue *Val = VPI->getOperand(0);
6507 VPValue *Addr = VPI->getOperand(1);
6508 // We need to store the exiting value of the reduction, so use the blend
6509 // if tail folded.
6510 if (auto *Blend = vputils::findUserOf<VPBlendRecipe>(Val))
6511 Val = Blend;
// Sanity check: the stored value must be the backedge value of the
// reduction phi that uses it.
6512 assert(
6513 vputils::findUserOf<VPReductionPHIRecipe>(Val)->getBackedgeValue() ==
6514 Val &&
6515 "Store isn't backedge value?");
6516 auto *Recipe = new VPReplicateRecipe(
6517 SI, {Val, Addr}, true /* IsUniform */, nullptr /*Mask*/, *VPI, *VPI,
6518 VPI->getDebugLoc());
6519 FinalRedStoresBuilder.insert(Recipe);
6520 }
// VPI is removed whether or not a replacement recipe was created.
6521 VPI->eraseFromParent();
6522 return true;
6523 }
6524
6525 return false;
6526}
6527
6529 VFRange &Range) {
// Creates a VPReplicateRecipe for the instruction underlying VPI. The recipe
// is marked uniform based on IsUniform (possibly forced for a few intrinsics
// with scalable VFs) and receives VPI's mask when the cost model reports the
// instruction as predicated. The recipe is returned without being inserted.
// NOTE(review): the declaration of IsUniform is not visible in this listing;
// only its per-VF predicate (CM.isUniformAfterVectorization) and Range are.
6530 auto *I = VPI->getUnderlyingInstr();
6532 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
6533 Range);
6534
6535 bool IsPredicated = CM.isPredicatedInst(I);
6536
6537 // Even if the instruction is not marked as uniform, there are certain
6538 // intrinsic calls that can be effectively treated as such, so we check for
6539 // them here. Conservatively, we only do this for scalable vectors, since
6540 // for fixed-width VFs we can always fall back on full scalarization.
6541 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
6542 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
6543 case Intrinsic::assume:
6544 case Intrinsic::lifetime_start:
6545 case Intrinsic::lifetime_end:
6546 // For scalable vectors if one of the operands is variant then we still
6547 // want to mark as uniform, which will generate one instruction for just
6548 // the first lane of the vector. We can't scalarize the call in the same
6549 // way as for fixed-width vectors because we don't know how many lanes
6550 // there are.
6551 //
6552 // The reasons for doing it this way for scalable vectors are:
6553 // 1. For the assume intrinsic generating the instruction for the first
6554 // lane is still better than not generating any at all. For
6555 // example, the input may be a splat across all lanes.
6556 // 2. For the lifetime start/end intrinsics the pointer operand only
6557 // does anything useful when the input comes from a stack object,
6558 // which suggests it should always be uniform. For non-stack objects
6559 // the effect is to poison the object, which still allows us to
6560 // remove the call.
6561 IsUniform = true;
6562 break;
6563 default:
6564 break;
6565 }
6566 }
// The mask stays null unless the instruction is predicated.
6567 VPValue *BlockInMask = nullptr;
6568 if (!IsPredicated) {
6569 // Finalize the recipe for Instr, first if it is not predicated.
6570 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6571 } else {
6572 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6573 // Instructions marked for predication are replicated and a mask operand is
6574 // added initially. Masked replicate recipes will later be placed under an
6575 // if-then construct to prevent side-effects. Generate recipes to compute
6576 // the block mask for this region.
6577 BlockInMask = VPI->getMask();
6578 }
6579
6580 // Note that there is some custom logic to mark some intrinsics as uniform
6581 // manually above for scalable vectors, which this assert needs to account for
6582 // as well.
6583 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
6584 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
6585 "Should not predicate a uniform recipe");
6586 auto *Recipe =
6587 new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
6588 BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
6589 return Recipe;
6590}
6591
6594 VFRange &Range) {
// Tries to create a widened recipe for the non-phi, non-call, non-memory
// recipe R, in this order: optimized induction truncate, widened GEP, widened
// cast, then the generic opcode-based tryToWiden. Returns nullptr when no
// widened recipe applies (including when shouldWiden rejects the instruction).
// NOTE(review): the head of the scalar-VF check before the first
// `return nullptr` is not visible in this listing.
6595 assert(!R->isPhi() && "phis must be handled earlier");
6596 // First, check for specific widening recipes that deal with optimizing
6597 // truncates and memory operations.
6598 auto *VPI = cast<VPInstruction>(R);
6599 assert(VPI->getOpcode() != Instruction::Call &&
6600 "Call should have been handled by makeCallWideningDecisions");
6601
6602 VPRecipeBase *Recipe;
6603 if (VPI->getOpcode() == Instruction::Trunc &&
6604 (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
6605 return Recipe;
6606
6607 // All widen recipes below deal only with VF > 1.
6609 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6610 return nullptr;
6611
6612 Instruction *Instr = R->getUnderlyingInstr();
6613 assert(!is_contained({Instruction::Load, Instruction::Store},
6614 VPI->getOpcode()) &&
6615 "Should have been handled prior to this!");
6616
6617 if (!shouldWiden(Instr, Range))
6618 return nullptr;
6619
6620 if (VPI->getOpcode() == Instruction::GetElementPtr)
6621 return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr),
6622 VPI->operandsWithoutMask(), *VPI,
6623 VPI->getDebugLoc());
6624
// Casts take their result type from the typed VPInstruction, not from the IR.
6625 if (Instruction::isCast(VPI->getOpcode())) {
6626 auto *CI = cast<CastInst>(Instr);
6627 auto *CastR = cast<VPInstructionWithType>(VPI);
6628 return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
6629 CastR->getResultType(), CI, *VPI, *VPI,
6630 VPI->getDebugLoc());
6631 }
6632
6633 return tryToWiden(VPI);
6634}
6635
6636// To allow RUN_VPLAN_PASS to print the VPlan after VF/UF independent
6637// optimizations.
6639
/// Builds candidate VPlans for the VFs from \p MinVF up to \p MaxVF and appends
/// them to VPlans. A common VPlan0 is created once; each candidate is built by
/// tryToBuildVPlan from a duplicate of it, covering successive sub-ranges of
/// [MinVF, 2 * MaxVF). Does nothing if \p MinVF is known to exceed \p MaxVF,
/// and returns early if one of the visible setup checks fails.
/// NOTE(review): many call heads in this function are not visible in this
/// listing; comments only describe what can be read here.
6640void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
6641 ElementCount MaxVF) {
6642 if (ElementCount::isKnownGT(MinVF, MaxVF))
6643 return;
6644
6645 bool IsInnerLoop = OrigLoop->isInnermost();
6646
6647 // Set up loop versioning for inner loops with memory runtime checks.
6648 // Outer loops don't have LoopAccessInfo since canVectorizeMemory() is not
6649 // called for them.
6650 std::optional<LoopVersioning> LVer;
6651 if (IsInnerLoop) {
6652 const LoopAccessInfo *LAI = Legal->getLAI();
6653 LVer.emplace(*LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop,
6654 LI, DT, PSE.getSE());
6655 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
6657 // Only use noalias metadata when using memory checks guaranteeing no
6658 // overlap across all iterations.
6659 LVer->prepareNoAliasMetadata();
6660 }
6661 }
6662
6663 // Create initial base VPlan0, to serve as common starting point for all
6664 // candidates built later for specific VF ranges.
6665 auto VPlan0 = VPlanTransforms::buildVPlan0(OrigLoop, *LI,
6666 Legal->getWidestInductionType(),
6667 PSE, LVer ? &*LVer : nullptr);
6668
6669 // Create recipes for header phis. For outer loops, reductions, recurrences
6670 // and in-loop reductions are empty since legality doesn't detect them.
6672 *OrigLoop, Legal->getInductionVars(),
6673 Legal->getReductionVars(),
6674 Legal->getFixedOrderRecurrences(),
6675 Config.getInLoopReductions(), Hints.allowReordering()))
6676 return;
6677
6678 if (const LoopAccessInfo *LAI = Legal->getLAI())
6680 LAI->getSymbolicStrides());
6684 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()));
6685 // If we're vectorizing a loop with an uncountable exit, make sure that the
6686 // recipes are safe to handle.
6687 // TODO: Remove this once we can properly check the VPlan itself for both
6688 // the presence of an uncountable exit and the presence of stores in
6689 // the loop inside handleEarlyExits itself.
6691 if (Legal->hasUncountableEarlyExit())
6692 EEStyle = Legal->hasUncountableExitWithSideEffects()
6695
6697 OrigLoop, PSE, *DT, Legal->getAssumptionCache()))
6698 return;
6699
6701 CM.foldTailByMasking());
6703 if (CM.foldTailByMasking())
6706
// Cover [MinVF, 2 * MaxVF) with sub-ranges. tryToBuildVPlan may shrink
// SubRange.End, which then becomes the start of the next sub-range.
6707 auto MaxVFTimes2 = MaxVF * 2;
6708 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
6709 VFRange SubRange = {VF, MaxVFTimes2};
6710 auto Plan =
6711 tryToBuildVPlan(std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange);
6712 VF = SubRange.End;
6713
// No plan could be built for this sub-range; move on to the next one.
6714 if (!Plan)
6715 continue;
6716
6717 // Now optimize the initial VPlan.
6721 Config.getMinimalBitwidths());
6723 // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
6724 if (CM.foldTailWithEVL()) {
6726 Config.getMaxSafeElements());
6728 }
6729
// narrowInterleaveGroups may return an additional plan, stored before Plan.
6730 if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
6731 VPlans.push_back(std::move(P));
6732
6734 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
6735 VPlans.push_back(std::move(Plan));
6736 }
6737}
6738
/// Tries to turn \p Plan into a VPlan that is valid for the VFs of \p Range,
/// replacing the input VPInstructions with widened or replicated recipes and
/// applying the follow-up transforms. \p Range may be clamped while decisions
/// are taken. Returns the finished plan, or nullptr when one of the mandatory
/// transformations bails out.
/// NOTE(review): many call heads in this function are not visible in this
/// listing; comments only describe what can be read here.
6739VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan,
6740 VFRange &Range) {
6741
6742 // For outer loops, the plan only needs basic recipe conversion and induction
6743 // live-out optimization; the full inner-loop recipe building below does not
6744 // apply (no widening decisions, interleave groups, reductions, etc.).
6745 if (Plan->isOuterLoop()) {
6746 for (ElementCount VF : Range)
6747 Plan->addVF(VF);
6749 return nullptr;
6751 /*FoldTail=*/false);
6752 return Plan;
6753 }
6754
6755 using namespace llvm::VPlanPatternMatch;
6756 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
6757
6758 // ---------------------------------------------------------------------------
6759 // Build initial VPlan: Scan the body of the loop in a topological order to
6760 // visit each basic block after having visited its predecessor basic blocks.
6761 // ---------------------------------------------------------------------------
6762
6763 bool RequiresScalarEpilogueCheck =
6765 [this](ElementCount VF) {
6766 return !CM.requiresScalarEpilogue(VF.isVector());
6767 },
6768 Range);
6769 // Update the branch in the middle block if a scalar epilogue is required.
6770 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
6771 if (!RequiresScalarEpilogueCheck && MiddleVPBB->getNumSuccessors() == 2) {
// Set the middle-block branch condition to constant false.
6772 auto *BranchOnCond = cast<VPInstruction>(MiddleVPBB->getTerminator());
6773 assert(MiddleVPBB->getSuccessors()[1] == Plan->getScalarPreheader() &&
6774 "second successor must be scalar preheader");
6775 BranchOnCond->setOperand(0, Plan->getFalse());
6776 }
6777
6778 // Don't use getDecisionAndClampRange here, because we don't know the UF
6779 // so this function is better to be conservative, rather than to split
6780 // it up into different VPlans.
6781 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
6782 bool IVUpdateMayOverflow = false;
6783 for (ElementCount VF : Range)
6784 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
6785
6786 TailFoldingStyle Style = CM.getTailFoldingStyle();
6787 // Use NUW for the induction increment if we proved that it won't overflow in
6788 // the vector loop or when not folding the tail. In the latter case, we know
6789 // that the canonical induction increment will not overflow as the vector trip
6790 // count is >= increment and a multiple of the increment.
6791 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
6792 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
6793 if (!HasNUW) {
// The increment is operand 0 of the terminator of the region's exiting
// block; the assert checks it is an add of the canonical IV.
6794 auto *IVInc =
6795 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0);
6796 assert(match(IVInc,
6797 m_VPInstruction<Instruction::Add>(
6798 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
6799 "Did not find the canonical IV increment");
6800 LoopRegion->clearCanonicalIVNUW(cast<VPInstruction>(IVInc));
6801 }
6802
6803 // ---------------------------------------------------------------------------
6804 // Pre-construction: record ingredients whose recipes we'll need to further
6805 // process after constructing the initial VPlan.
6806 // ---------------------------------------------------------------------------
6807
6808 // For each interleave group which is relevant for this (possibly trimmed)
6809 // Range, add it to the set of groups to be later applied to the VPlan and add
6810 // placeholders for its members' Recipes which we'll be replacing with a
6811 // single VPInterleaveRecipe.
6812 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
6813 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
6814 bool Result = (VF.isVector() && // Query is illegal for VF == 1
6815 CM.getWideningDecision(IG->getInsertPos(), VF) ==
6817 // For scalable vectors, the interleave factors must be <= 8 since we
6818 // require the (de)interleaveN intrinsics instead of shufflevectors.
6819 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
6820 "Unsupported interleave factor for scalable vectors");
6821 return Result;
6822 };
6823 if (!getDecisionAndClampRange(ApplyIG, Range))
6824 continue;
6825 InterleaveGroups.insert(IG);
6826 }
6827
6828 // ---------------------------------------------------------------------------
6829 // Construct wide recipes and apply predication for original scalar
6830 // VPInstructions in the loop.
6831 // ---------------------------------------------------------------------------
6832 VPRecipeBuilder RecipeBuilder(*Plan, Legal, CM, Builder);
6833
6834 // Scan the body of the loop in a topological order to visit each basic block
6835 // after having visited its predecessor basic blocks.
6836 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
6837 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
6838 HeaderVPBB);
6839
6841 Range.Start);
6842
6843 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
6844 OrigLoop);
6845
6847 RecipeBuilder);
6848
6850
6852 RecipeBuilder, CostCtx);
6853
6854 // Now process all other blocks and instructions.
6855 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
6856 // Convert input VPInstructions to widened recipes.
// The early-inc range allows erasing R at the end of each iteration.
6857 for (VPRecipeBase &R : make_early_inc_range(
6858 make_range(VPBB->getFirstNonPhi(), VPBB->end()))) {
6859 // Skip recipes that do not need transforming or have already been
6860 // transformed.
6861 if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe,
6862 VPReplicateRecipe, VPWidenLoadRecipe, VPWidenStoreRecipe,
6863 VPWidenCallRecipe, VPWidenIntrinsicRecipe, VPVectorPointerRecipe,
6864 VPVectorEndPointerRecipe, VPHistogramRecipe>(&R))
6865 continue;
// VPInstructions without an underlying IR value are left as they are.
6866 auto *VPI = cast<VPInstruction>(&R);
6867 if (!VPI->getUnderlyingValue())
6868 continue;
6869
6870 // TODO: Gradually replace uses of underlying instruction by analyses on
6871 // VPlan. Migrate code relying on the underlying instruction from VPlan0
6872 // to construct recipes below to not use the underlying instruction.
6874 Builder.setInsertPoint(VPI);
6875
// Prefer a widened recipe; fall back to replication when none applies.
6876 VPRecipeBase *Recipe =
6877 RecipeBuilder.tryToCreateWidenNonPhiRecipe(VPI, Range);
6878 if (!Recipe)
6879 Recipe =
6880 RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
6881
6882 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
6883 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
6884 // moved to the phi section in the header.
6885 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
6886 } else {
6887 Builder.insert(Recipe);
6888 }
// Redirect users of the old VPInstruction to the new recipe's value, if it
// defines one, then drop the old VPInstruction.
6889 if (Recipe->getNumDefinedValues() == 1) {
6890 VPI->replaceAllUsesWith(Recipe->getVPSingleValue());
6891 } else {
6892 assert(Recipe->getNumDefinedValues() == 0 &&
6893 "Unexpected multidef recipe");
6894 }
6895 R.eraseFromParent();
6896 }
6897 }
6898
6899 assert(isa<VPRegionBlock>(LoopRegion) &&
6900 !LoopRegion->getEntryBasicBlock()->empty() &&
6901 "entry block must be set to a VPRegionBlock having a non-empty entry "
6902 "VPBasicBlock");
6903
6905 Range);
6906
6907 // ---------------------------------------------------------------------------
6908 // Transform initial VPlan: Apply previously taken decisions, in order, to
6909 // bring the VPlan to its final state.
6910 // ---------------------------------------------------------------------------
6911
6912 addReductionResultComputation(Plan, RecipeBuilder, Range.Start);
6913
6914 // Optimize FindIV reductions to use sentinel-based approach when possible.
6916 *OrigLoop);
6918 CM.foldTailByMasking());
6919
6920 // Apply mandatory transformation to handle reductions with multiple in-loop
6921 // uses if possible, bail out otherwise.
6923 OrigLoop))
6924 return nullptr;
6925 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
6926 // NaNs if possible, bail out otherwise.
6928 return nullptr;
6929
6930 // Create whole-vector selects for find-last recurrences.
6932 return nullptr;
6933
6935
6936 // Create partial reduction recipes for scaled reductions and transform
6937 // recipes to abstract recipes if it is legal and beneficial and clamp the
6938 // range for better cost estimation.
6939 // TODO: Enable following transform when the EVL-version of extended-reduction
6940 // and mulacc-reduction are implemented.
6941 if (!CM.foldTailWithEVL()) {
6942 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
6943 OrigLoop);
6945 Range);
6947 Range);
6948 }
6949
6950 // Interleave memory: for each Interleave Group we marked earlier as relevant
6951 // for this VPlan, replace the Recipes widening its memory instructions with a
6952 // single VPInterleaveRecipe at its insertion point.
6954 InterleaveGroups, CM.isEpilogueAllowed());
6955
6956 // Convert memory recipes to strided access recipes if the strided access is
6957 // legal and profitable. Use a new VPCostContext to ensure type inference
6958 // reflects the current plan state.
6959 // TODO: Remove this VPCostContext scope once VPTypeAnalysis is removed.
6960 {
6961 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
6962 OrigLoop);
6964 *OrigLoop, CostCtx, Range);
6965 }
6966
6967 // Ensure scalar VF plans only contain VF=1, as required by hasScalarVFOnly.
6968 if (Range.Start.isScalar())
6969 Range.End = Range.Start * 2;
6970
// Record every VF of the (possibly clamped) range on the plan.
6971 for (ElementCount VF : Range)
6972 Plan->addVF(VF);
6973 Plan->setName("Initial VPlan");
6974
6976
6977 if (useActiveLaneMask(Style)) {
6978 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
6979 // TailFoldingStyle is visible there.
6980 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
6981 RUN_VPLAN_PASS(VPlanTransforms::addActiveLaneMask, *Plan, ForControlFlow);
6982 }
6983
6984 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
6985 return Plan;
6986}
6987
/// For every VPReductionPHIRecipe in the entry block of \p Plan's vector loop
/// region, create the recipes that compute the final reduction result in the
/// middle block and redirect users of the exiting value that live outside the
/// vector region to that result.
///
/// AnyOf reductions are rewritten to a boolean (i1) reduction chain whose
/// result is consumed by an AnyOf reduction created in the middle block. All
/// other reductions get a ComputeReductionResult VPInstruction; when \p MinVF
/// is a vector VF and the recurrence type differs from the phi's inferred
/// scalar type, the exiting value is truncated and re-extended around it.
///
/// \param Plan          the VPlan being transformed in place.
/// \param RecipeBuilder not referenced by the lines visible in this listing.
/// \param MinVF         only queried via isVector() to decide whether the
///                      narrowed-type handling applies.
///
/// NOTE(review): this listing is missing source lines 7006, 7090, 7128, 7164,
/// 7170, 7173-7174, 7179-7182, 7188 and 7194 (see the gaps in the embedded line
/// numbers); consult the original file before changing any code here.
6988void LoopVectorizationPlanner::addReductionResultComputation(
6989 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
6990 using namespace VPlanPatternMatch;
6991 VPTypeAnalysis TypeInfo(*Plan);
6992 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
6993 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
6994 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
 // Point the shared Builder at the second-to-last recipe of the latch block.
6995 Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
 // First non-phi position of the middle block; the final reduction results
 // are created at this position below.
6996 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
6997 VPValue *HeaderMask = vputils::findHeaderMask(*Plan);
6998 for (VPRecipeBase &R :
6999 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
 // Only reduction phis are processed; every other header phi is skipped.
7000 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
7001 if (!PhiR)
7002 continue;
7003
7004 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
7005 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
7007 Type *PhiTy = TypeInfo.inferScalarType(PhiR);
7008
7009 // Convert a VPBlendRecipe backedge to a select.
7010 if (auto *Blend = dyn_cast<VPBlendRecipe>(PhiR->getBackedgeValue())) {
7011 if (Blend->getNumIncomingValues() == 2 &&
7012 Blend->getMask(0) == HeaderMask) {
7013 auto *Sel = VPBuilder(Blend).createSelect(
7014 Blend->getMask(0), Blend->getIncomingValue(0),
7015 Blend->getIncomingValue(1), {}, "", *Blend);
7016 Blend->replaceAllUsesWith(Sel);
7017 Blend->eraseFromParent();
7018 }
7019 }
7020
 // Captured before the backedge may be rewritten below; OrigExitingVPV is the
 // value whose outside users are redirected at the end of this iteration.
7021 auto *OrigExitingVPV = PhiR->getBackedgeValue();
7022 auto *NewExitingVPV = OrigExitingVPV;
7023
7024 // Remove the predicated select if the target doesn't want it.
7025 VPValue *V;
7026 if (!CM.usePredicatedReductionSelect(RecurrenceKind) &&
7027 match(PhiR->getBackedgeValue(),
7028 m_Select(m_Specific(HeaderMask), m_VPValue(V), m_Specific(PhiR))))
7029 PhiR->setBackedgeValue(V);
7030
7031 // We want code in the middle block to appear to execute on the location of
7032 // the scalar loop's latch terminator because: (a) it is all compiler
7033 // generated, (b) these instructions are always executed after evaluating
7034 // the latch conditional branch, and (c) other passes may add new
7035 // predecessors which terminate on this line. This is the easiest way to
7036 // ensure we don't accidentally cause an extra step back into the loop while
7037 // debugging.
7038 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
7039
7040 // TODO: At the moment ComputeReductionResult also drives creation of the
7041 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
7042 // even for in-loop reductions, until the reduction resume value handling is
7043 // also modeled in VPlan.
7044 VPInstruction *FinalReductionResult;
 // InsertPointGuard: presumably restores Builder's insert point when this
 // loop iteration's scope ends - confirm against VPBuilder.
7045 VPBuilder::InsertPointGuard Guard(Builder);
7046 Builder.setInsertPoint(MiddleVPBB, IP);
7047 // For AnyOf reductions, find the select among PhiR's users and convert
7048 // the reduction phi to operate on bools before creating the final
7049 // reduction result.
7050 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
 // NOTE(review): the find_if result is dereferenced without a check, i.e. a
 // select among PhiR's users is assumed to exist.
7051 auto *AnyOfSelect =
7052 cast<VPSingleDefRecipe>(*find_if(PhiR->users(), [](VPUser *U) {
7053 return match(U, m_Select(m_VPValue(), m_VPValue(), m_VPValue()));
7054 }));
7055 VPValue *Start = PhiR->getStartValue();
7056 bool TrueValIsPhi = AnyOfSelect->getOperand(1) == PhiR;
7057 // NewVal is the non-phi operand of the select.
7058 VPValue *NewVal = TrueValIsPhi ? AnyOfSelect->getOperand(2)
7059 : AnyOfSelect->getOperand(1);
7060
7061 // Adjust AnyOf reductions; replace the reduction phi for the selected
7062 // value with a boolean reduction phi node to check if the condition is
7063 // true in any iteration. The final value is selected by the final
7064 // ComputeReductionResult.
7065 VPValue *Cmp = AnyOfSelect->getOperand(0);
7066 // If the compare is checking the reduction PHI node, adjust it to check
7067 // the start value.
7068 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
7069 CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
7070 Builder.setInsertPoint(AnyOfSelect);
7071
7072 // If the true value of the select is the reduction phi, the new value
7073 // is selected if the negated condition is true in any iteration.
7074 if (TrueValIsPhi)
7075 Cmp = Builder.createNot(Cmp);
7076
7077 // Build a fresh i1 chain (phi, or, and i1 versions of any blend/select
7078 // the exiting value flows through).
7079 auto *NewPhiR =
7080 PhiR->cloneWithOperands(Plan->getFalse(), Plan->getFalse());
7081 NewPhiR->insertBefore(PhiR);
7082 VPValue *NewExiting = Builder.createOr(NewPhiR, Cmp);
7083
7084 // The exiting value may flow through a VPBlendRecipe and/or a wrapping
7085 // VPInstruction::Select before reaching OrigExitingVPV. Clone each level
7086 // of the chain with the i1 substitutions propagated through.
7087 DenseMap<VPValue *, VPValue *> Substitutions = {{AnyOfSelect, NewExiting},
7088 {PhiR, NewPhiR}};
 // Clones Old with each operand replaced by its entry in Substitutions (or
 // kept as-is when absent), inserts the clone before Old, and records the
 // clone both as the new exiting value and as the substitution for Old.
 // NOTE(review): the declaration of NewOps (line 7090) is missing here.
7089 auto CloneWithSubstitutions = [&](VPSingleDefRecipe *Old) {
7091 for (VPValue *Op : Old->operands())
7092 NewOps.push_back(Substitutions.lookup_or(Op, Op));
7093 VPSingleDefRecipe *New;
7094 if (auto *B = dyn_cast<VPBlendRecipe>(Old))
7095 New = B->cloneWithOperands(NewOps);
7096 else
7097 New = cast<VPInstruction>(Old)->cloneWithOperands(NewOps);
7098 New->insertBefore(Old);
7099 NewExiting = New;
7100 Substitutions[Old] = New;
7101 };
7102
7103 // If there's an outer Select wrapping a Blend, clone the inner Blend
7104 // first so the outer Select clone can refer to it.
7105 if (OrigExitingVPV != AnyOfSelect) {
7106 VPValue *Inner;
7107 if (match(OrigExitingVPV,
7108 m_Select(m_VPValue(), m_VPValue(Inner), m_VPValue())))
7109 if (auto *InnerBlend = dyn_cast<VPBlendRecipe>(Inner))
7110 CloneWithSubstitutions(InnerBlend);
7111 CloneWithSubstitutions(cast<VPSingleDefRecipe>(OrigExitingVPV));
7112 }
 // Close the i1 chain: operand 1 of the new phi becomes the new exiting value.
7113 NewPhiR->setOperand(1, NewExiting);
 // All remaining uses of the original phi are replaced with poison of the
 // phi's scalar type.
7114 PhiR->replaceAllUsesWith(
7115 Plan->getOrAddLiveIn(PoisonValue::get(PhiR->getScalarType())));
7116
 // Switch back to the middle block to emit the final result.
7117 Builder.setInsertPoint(MiddleVPBB, IP);
7118 FinalReductionResult =
7119 Builder.createAnyOfReduction(NewExiting, NewVal, Start, ExitDL);
7120 } else {
7121 // If the vector reduction can be performed in a smaller type, we
7122 // truncate then extend the loop exit value to enable InstCombine to
7123 // evaluate the entire expression in the smaller type.
7124 VPValue *ReductionOp = NewExitingVPV;
 // CastOpsEnd acts as the "no extension needed" marker checked further down.
7125 Instruction::CastOps ExtendOpc = Instruction::CastOpsEnd;
7126 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
7127 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
7129 "Unexpected truncated min-max recurrence!");
7130 Type *RdxTy = RdxDesc.getRecurrenceType();
7131 ExtendOpc = RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
7132 {
 // The trunc/extend pair is emitted right after the recipe defining the
 // exiting value; the nested guard scopes this temporary insert point.
7133 VPBuilder::InsertPointGuard Guard(Builder);
7134 Builder.setInsertPoint(
7135 NewExitingVPV->getDefiningRecipe()->getParent(),
7136 std::next(NewExitingVPV->getDefiningRecipe()->getIterator()));
7137 ReductionOp =
7138 Builder.createWidenCast(Instruction::Trunc, NewExitingVPV, RdxTy);
7139 VPWidenCastRecipe *Extnd =
7140 Builder.createWidenCast(ExtendOpc, ReductionOp, PhiTy);
 // Only rewire operand 1 of the phi if it still is the exiting value.
7141 if (PhiR->getOperand(1) == NewExitingVPV)
7142 PhiR->setOperand(1, Extnd);
7143 }
7144 }
7145
7146 VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
7147 PhiR->getFastMathFlags());
7148 FinalReductionResult = Builder.createNaryOp(
7149 VPInstruction::ComputeReductionResult, {ReductionOp}, Flags, ExitDL);
 // If the reduction ran in the narrower type, extend the scalar result back
 // to the phi's type.
7150 if (ExtendOpc != Instruction::CastOpsEnd)
7151 FinalReductionResult = Builder.createScalarCast(
7152 ExtendOpc, FinalReductionResult, PhiTy, {});
7153 }
7154
7155 // Update all users outside the vector region. Also replace redundant
7156 // extracts.
 // to_vector snapshots the user list, since uses are rewritten in the loop.
7157 for (auto *U : to_vector(OrigExitingVPV->users())) {
7158 auto *Parent = cast<VPRecipeBase>(U)->getParent();
 // Skip the freshly created result itself and any user whose block has a
 // parent (per the comment above, users inside the vector region).
7159 if (FinalReductionResult == U || Parent->getParent())
7160 continue;
7161 // Skip ComputeReductionResult and FindIV reductions when they are not the
7162 // final result.
7163 if (match(U, m_VPInstruction<VPInstruction::ComputeReductionResult>()) ||
7165 match(U, m_VPInstruction<Instruction::ICmp>())))
7166 continue;
7167 U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
7168
 // NOTE(review): the conditions guarding the next two statements (lines 7170
 // and 7173-7174) are missing from this listing.
7169 // Look through ExtractLastPart.
7171 U = cast<VPInstruction>(U)->getSingleUser();
7172
7175 cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
7176 }
7177
7178 RecurKind RK = PhiR->getRecurrenceKind();
 // NOTE(review): lines 7179-7182 are missing; they presumably open the
 // conditional block that is closed at line 7191 - confirm in the original.
 // The visible part builds a new start value in the vector preheader from the
 // phi's start value, the recurrence identity and a constant scale factor of
 // 1, and installs it as operand 0 of the phi.
7183 VPBuilder PHBuilder(Plan->getVectorPreheader());
7184 VPValue *Iden = Plan->getOrAddLiveIn(
7185 getRecurrenceIdentity(RK, PhiTy, PhiR->getFastMathFlags()));
7186 auto *ScaleFactorVPV = Plan->getConstantInt(32, 1);
7187 VPValue *StartV = PHBuilder.createNaryOp(
7189 {PhiR->getStartValue(), Iden, ScaleFactorVPV}, *PhiR);
7190 PhiR->setOperand(0, StartV);
7191 }
7192 }
7193
7195}
7196
// NOTE(review): the line carrying this const member function's return type and
// name (source line 7197) is missing from this listing, as are lines 7204,
// 7211 and 7229, which hold the start of the statements that consume
// SCEVCheckBlock / MemCheckBlock below.
//
// Visible behavior: fetches the SCEV check and the memory runtime check
// (condition + block pairs) from RTChecks and acts on each one only if its
// block exists and has no predecessors. For memory checks under
// Config.OptForSize, an analysis remark is emitted suggesting how code size
// could be reduced.
7198 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
7199 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
7200 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
 // SCEV checks are only expected when not optimizing for size, unless
 // vectorization was explicitly forced.
7201 assert((!Config.OptForSize ||
7202 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
7203 "Cannot SCEV check stride or overflow when optimizing for size");
7205 SCEVCheckBlock, HasBranchWeights);
7206 }
7207 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
7208 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
7209 // VPlan-native path does not do any analysis for runtime checks
7210 // currently.
7212 "Runtime checks are not supported for outer loops yet");
7213
7214 if (Config.OptForSize) {
 // Memory checks while optimizing for size are only valid when the user
 // forced vectorization; tell the user how to avoid the size cost.
7215 assert(
7216 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
7217 "Cannot emit memory checks when optimizing for size, unless forced "
7218 "to vectorize.");
7219 ORE->emit([&]() {
7220 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
7221 OrigLoop->getStartLoc(),
7222 OrigLoop->getHeader())
7223 << "Code-size may be reduced by not forcing "
7224 "vectorization, or by source-code modifications "
7225 "eliminating the need for runtime checks "
7226 "(e.g., adding 'restrict').";
7227 });
7228 }
7230 MemCheckBlock, HasBranchWeights);
7231 }
7232}
7233
// NOTE(review): the line with this const member function's return type and
// name (source line 7234) is missing from this listing, as are line 7239 (the
// value selected when branch-weight metadata is present) and line 7241 (the
// start of the call whose trailing arguments appear below).
//
// Visible behavior: BranchWeights is nullptr unless the original loop's latch
// terminator has branch-weight metadata. The trailing call receives the
// minimum profitable trip count, whether a scalar epilogue is required,
// whether the tail is folded by masking, the original loop, the branch
// weights, the debug location of the loop predecessor's terminator, PSE and
// the plan's entry block.
7235 VPlan &Plan, ElementCount VF, unsigned UF,
7236 ElementCount MinProfitableTripCount) const {
7237 const uint32_t *BranchWeights =
7238 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
7240 : nullptr;
7242 MinProfitableTripCount,
7243 CM.requiresScalarEpilogue(VF.isVector()),
7244 CM.foldTailByMasking(), OrigLoop, BranchWeights,
7245 OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
7246 PSE, Plan.getEntry());
7247}
7248
7249// Determine how to lower the epilogue, which depends on 1) optimising
7250// for minimum code-size, 2) tail-folding compiler options, 3) loop
7251// hints forcing tail-folding, and 4) a TTI hook that analyses whether the loop
7252// is suitable for tail-folding.
7253// This function determines epilogue lowering for the main vector loop while
7254// epilogue lowering for the tail-folded epilogue path will be handled
7255// separately in getEpilogueTailLowering.
//
// The four criteria are checked in the order listed; the first one that
// applies decides the result, and CM_EpilogueAllowed is the fallback.
//
// NOTE(review): this listing is missing source lines 7257 and 7259 (function
// name and part of the parameter list), 7265, 7270, 7272-7275, 7281-7283 and
// 7290 (the case labels and several of the return statements). Consult the
// original file for the values returned by steps 1-4.
7256static EpilogueLowering
7258 bool OptForSize, TargetTransformInfo *TTI,
7260 InterleavedAccessInfo *IAI) {
7261 // 1) OptSize takes precedence over all other options, i.e. if this is set,
7262 // don't look at hints or options, and don't request an epilogue.
7263 if (F->hasOptSize() ||
7264 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
7266
7267 // 2) If set, obey the directives
 // getNumOccurrences() is non-zero only when the option was given explicitly.
7268 if (TailFoldingPolicy.getNumOccurrences()) {
7269 switch (TailFoldingPolicy) {
7271 return CM_EpilogueAllowed;
7276 };
7277 }
7278
7279 // 3) If set, obey the hints
7280 switch (Hints.getPredicate()) {
7284 return CM_EpilogueAllowed;
7285 };
7286
7287 // 4) if the TTI hook indicates this is profitable, request tail-folding.
7288 TailFoldingInfo TFI(TLI, &LVL, IAI);
7289 if (TTI->preferTailFoldingOverEpilogue(&TFI))
7291
 // Default: none of the criteria above requested anything else.
7292 return CM_EpilogueAllowed;
7293}
7294
7295/// Determine how to lower the epilogue for the vector epilogue loop.
7296/// Check if there are any conflicts that prevent tail-folding the epilogue.
7297/// \return CM_EpilogueNotNeededFoldTail if epilogue tail-folding is possible,
7298/// otherwise CM_EpilogueAllowed.
///
/// Every visible early exit returns CM_EpilogueAllowed: when the epilogue
/// tail-folding option was not given explicitly, when it conflicts with
/// another option (reported as a vectorization remark), when the main cost
/// model requires a scalar epilogue, or when the main cost model does not
/// allow an epilogue at all.
///
/// NOTE(review): this listing is missing source lines 7300-7301 (function name
/// and parameters), 7304, 7307-7308 (the conflict condition and the start of
/// the reporting call) and 7331 (the final return statement).
7299static EpilogueLowering
7302 // Epilogue TF is only enabled when explicitly requested via command line.
7303 if (!EpilogueTailFoldingPolicy.getNumOccurrences() ||
7305 return CM_EpilogueAllowed;
7306
7309 "Options conflict, epilogue vectorization is disallowed while "
7310 "epilogue tail-folding allowed!\n",
7311 "UnsupportedEpilogueTailFoldingPolicy", ORE, L);
7312 return CM_EpilogueAllowed;
7313 }
7314
7315 // If scalar epilogue is explicitly required, we can't apply TF.
7316 if (MainCM.requiresScalarEpilogue(/*IsVectorizing*/ true)) {
7317 LLVM_DEBUG(dbgs() << "LV: Epilogue tail-folding can't be applied because "
7318 "scalar epilogue is required\n"
7319 "LV: Fall back to a normal epilogue\n");
7320 return CM_EpilogueAllowed;
7321 }
7322
7323 // If having epilogue is NOT allowed, then no epilogue to apply TF for.
7324 if (!MainCM.isEpilogueAllowed()) {
7325 LLVM_DEBUG(dbgs() << "LV: No epilogue to apply tail-folding for.\n"
7326 "LV: Fall back to a normal epilogue\n");
7327 return CM_EpilogueAllowed;
7328 }
7329
7330 // We can apply tail-folding on the vectorized epilogue loop.
7332}
7333
7334// Emit a remark if there are stores to floats that required a floating point
7335// extension. If the vectorized loop was generated with floating point there
7336// will be a performance penalty from the conversion overhead and the change in
7337// the vector width.
//
// Algorithm: seed a worklist with every store in the loop whose stored value
// has type float, then walk operand chains upwards within the loop. Each
// FPExtInst reached for the first time triggers one "VectorMixedPrecision"
// analysis remark.
//
// NOTE(review): this listing is missing source lines 7338-7339 (the function
// signature and, presumably, the Worklist declaration) and 7351-7352 (the
// declarations of Visited and EmittedRemark) - confirm in the original file.
7340 for (BasicBlock *BB : L->getBlocks()) {
7341 for (Instruction &Inst : *BB) {
7342 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
7343 if (S->getValueOperand()->getType()->isFloatTy())
7344 Worklist.push_back(S);
7345 }
7346 }
7347 }
7348
7349 // Traverse upwards from the floating point stores, searching for floating
7350 // point conversions.
7353 while (!Worklist.empty()) {
7354 auto *I = Worklist.pop_back_val();
 // Do not follow operand chains that leave the loop.
7355 if (!L->contains(I))
7356 continue;
 // Visit each instruction at most once.
7357 if (!Visited.insert(I).second)
7358 continue;
7359
7360 // Emit a remark if the floating point store required a floating
7361 // point conversion.
7362 // TODO: More work could be done to identify the root cause such as a
7363 // constant or a function return type and point the user to it.
7364 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
7365 ORE->emit([&]() {
7366 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
7367 I->getDebugLoc(), L->getHeader())
7368 << "floating point conversion changes vector width. "
7369 << "Mixed floating point precision requires an up/down "
7370 << "cast that will negatively impact performance.";
7371 });
7372
 // Continue the upward walk through all instruction operands.
7373 for (Use &Op : I->operands())
7374 if (auto *OpI = dyn_cast<Instruction>(Op))
7375 Worklist.push_back(OpI);
7376 }
7377}
7378
7379/// For loops with uncountable early exits, find the cost of doing work when
7380/// exiting the loop early, such as calculating the final exit values of
7381/// variables used outside the loop.
7382/// TODO: This is currently overly pessimistic because the loop may not take
7383/// the early exit, but better to keep this conservative for now. In future,
7384/// it might be possible to relax this by using branch probabilities.
///
/// \return the sum, over every predecessor of every exit block of \p Plan that
/// is not the middle block, of that predecessor's cost for \p VF. A block that
/// precedes several exit blocks is counted once per exit block.
///
/// NOTE(review): the first line of the signature (source line 7385, return
/// type, name and the CostCtx parameter) is missing from this listing.
7386 VPlan &Plan, ElementCount VF) {
7387 InstructionCost Cost = 0;
7388 for (auto *ExitVPBB : Plan.getExitBlocks()) {
7389 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
7390 // If the predecessor is not the middle.block, then it must be the
7391 // vector.early.exit block, which may contain work to calculate the exit
7392 // values of variables used outside the loop.
7393 if (PredVPBB != Plan.getMiddleBlock()) {
7394 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
7395 << PredVPBB->getName() << ":\n");
7396 Cost += PredVPBB->cost(VF, CostCtx);
7397 }
7398 }
7399 }
7400 return Cost;
7401}
7402
7403/// This function determines whether or not it's still profitable to vectorize
7404/// the loop given the extra work we have to do outside of the loop:
7405/// 1. Perform the runtime checks before entering the loop to ensure it's safe
7406/// to vectorize.
7407/// 2. In the case of loops with uncountable early exits, we may have to do
7408/// extra work when exiting the loop early, such as calculating the final
7409/// exit values of variables used outside the loop.
7410/// 3. The middle block.
///
/// \return false if the runtime-check cost is invalid, if the interleave-only
/// threshold check fails, or if the best known trip count is known to be less
/// than the computed minimum profitable trip count; true otherwise.
///
/// NOTE(review): this listing is missing source lines 7413 (part of the
/// parameter list; PSE, used below, is presumably declared there), 7425 (the
/// interleave-only threshold condition) and 7498.
7411static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
7412 VectorizationFactor &VF, Loop *L,
7414 VPCostContext &CostCtx, VPlan &Plan,
7415 EpilogueLowering SEL,
7416 std::optional<unsigned> VScale) {
7417 InstructionCost RtC = Checks.getCost();
7418 if (!RtC.isValid())
7419 return false;
7420
7421 // When interleaving only, scalar and vector cost will be equal, which in turn
7422 // would lead to a divide by 0. Fall back to hard threshold.
7423 if (VF.Width.isScalar()) {
7424 // TODO: Should we rename VectorizeMemoryCheckThreshold?
7426 LLVM_DEBUG(
7427 dbgs()
7428 << "LV: Interleaving only is not profitable due to runtime checks\n");
7429 return false;
7430 }
7431 return true;
7432 }
7433
7434 // The scalar cost should only be 0 when vectorizing with a user specified
7435 // VF/IC. In those cases, runtime checks should always be generated.
7436 uint64_t ScalarC = VF.ScalarCost.getValue();
7437 if (ScalarC == 0)
7438 return true;
7439
7440 InstructionCost TotalCost = RtC;
7441 // Add on the cost of any work required in the vector early exit block, if
7442 // one exists.
7443 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
7444 TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
7445
7446 // First, compute the minimum iteration count required so that the vector
7447 // loop outperforms the scalar loop.
7448 // The total cost of the scalar loop is
7449 // ScalarC * TC
7450 // where
7451 // * TC is the actual trip count of the loop.
7452 // * ScalarC is the cost of a single scalar iteration.
7453 //
7454 // The total cost of the vector loop is
7455 // TotalCost + VecC * (TC / VF) + EpiC
7456 // where
7457 // * TotalCost is the sum of the costs of
7458 // - the generated runtime checks, i.e. RtC
7459 // - performing any additional work in the vector.early.exit block for
7460 // loops with uncountable early exits.
7461 // - the middle block, if ExpectedTC <= VF.Width.
7462 // * VecC is the cost of a single vector iteration.
7463 // * TC is the actual trip count of the loop
7464 // * VF is the vectorization factor
7465 // * EpiC is the cost of the generated epilogue, including the cost
7466 // of the remaining scalar operations.
7467 //
7468 // Vectorization is profitable once the total vector cost is less than the
7469 // total scalar cost:
7470 // TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
7471 //
7472 // Now we can compute the minimum required trip count TC as
7473 // VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
7474 //
7475 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
7476 // the computation below uses integer arithmetic with the division rounded
7477 // up (divideCeil), hence we get an upper estimate of the TC.
7478 unsigned IntVF = estimateElementCount(VF.Width, VScale);
 // NOTE(review): Div is computed with unsigned arithmetic; it wraps around if
 // VF.Cost exceeds ScalarC * IntVF - confirm callers rule that case out. A Div
 // of exactly 0 is handled explicitly and yields MinTC1 == 0.
7479 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
7480 uint64_t MinTC1 =
7481 Div == 0 ? 0 : divideCeil(TotalCost.getValue() * IntVF, Div);
7482
7483 // Second, compute a minimum iteration count so that the cost of the
7484 // runtime checks is only a fraction of the total scalar loop cost. This
7485 // adds a loop-dependent bound on the overhead incurred if the runtime
7486 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
7487 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
7488 // cost, compute
7489 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
 // The code below uses X = 10.
7490 uint64_t MinTC2 = divideCeil(RtC.getValue() * 10, ScalarC);
7491
7492 // Now pick the larger minimum. If it is not a multiple of VF and an epilogue
7493 // is allowed, choose the next closest multiple of VF. This should partly
7494 // compensate for ignoring the epilogue cost.
7495 uint64_t MinTC = std::max(MinTC1, MinTC2);
7496 if (SEL == CM_EpilogueAllowed)
7497 MinTC = alignTo(MinTC, IntVF);
 // NOTE(review): source line 7498 is missing from this listing;
 // VF.MinProfitableTripCount, read below, is presumably set from MinTC there.
7499
7500 LLVM_DEBUG(
7501 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
7502 << VF.MinProfitableTripCount << "\n");
7503
7504 // Skip vectorization if the expected trip count is less than the minimum
7505 // required trip count.
7506 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
7507 if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
7508 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
7509 "trip count < minimum profitable VF ("
7510 << *ExpectedTC << " < " << VF.MinProfitableTripCount
7511 << ")\n");
7512
7513 return false;
7514 }
7515 }
7516 return true;
7517}
7518
// Member-initializer list fragment: InterleaveOnlyWhenForced and
// VectorizeOnlyWhenForced are each initialized from the corresponding field of
// Opts OR-ed with another operand.
// NOTE(review): the declaration line (source line 7519), the right-hand
// operands of both `||` (lines 7521 and 7523) and the body are missing from
// this listing; presumably this is the pass constructor taking Opts - confirm.
7520 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7522 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7524
7525/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
7526/// vectorization.
///
/// Visible steps:
///  1. Freeze the start value of FindIV reduction results in the middle blocks
///     of both MainPlan and EpiPlan (updating phi users only in MainPlan).
///  2. Find or create, at the beginning of MainPlan's scalar preheader, a
///     resume phi of the vector trip count and zero, named
///     "vec.epilog.resume.val".
///  3. Create ResumeForEpilogue instructions for that phi and for operand 0 of
///     every phi in MainPlan's scalar header.
///
/// \return the ResumeForEpilogue instructions created for the scalar header
/// phis, in phi order.
///
/// NOTE(review): this listing is missing source lines 7527-7528 (the function
/// signature), 7546 (the condition guarding a `continue`) and 7562 (the
/// initializer of Term).
7529 using namespace VPlanPatternMatch;
7530 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
7531 // introduce multiple uses of undef/poison. If the reduction start value may
7532 // be undef or poison it needs to be frozen and the frozen start has to be
7533 // used when computing the reduction result. We also need to use the frozen
7534 // value in the resume phi generated by the main vector loop, as this is also
7535 // used to compute the reduction result after the epilogue vector loop.
7536 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
7537 bool UpdateResumePhis) {
 // Freeze instructions are created via a builder on the plan's entry block.
7538 VPBuilder Builder(Plan.getEntry());
7539 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
7540 auto *VPI = dyn_cast<VPInstruction>(&R);
7541 if (!VPI)
7542 continue;
7543 VPValue *OrigStart;
7544 if (!matchFindIVResult(VPI, m_VPValue(), m_VPValue(OrigStart)))
7545 continue;
7547 continue;
7548 VPInstruction *Freeze =
7549 Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
 // Operand 2 of the matched result is replaced with the frozen start value.
7550 VPI->setOperand(2, Freeze);
 // Optionally also redirect VPPhi users of the original start value to the
 // frozen value; the freeze itself is excluded from the replacement.
7551 if (UpdateResumePhis)
7552 OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
7553 return Freeze != &U && isa<VPPhi>(&U);
7554 });
7555 }
7556 };
7557 AddFreezeForFindLastIVReductions(MainPlan, true);
7558 AddFreezeForFindLastIVReductions(EpiPlan, false);
7559
 // Extract the vector trip count from the BranchOnCount terminator Term.
7560 VPValue *VectorTC = nullptr;
7561 auto *Term =
7563 [[maybe_unused]] bool MatchedTC =
7564 match(Term, m_BranchOnCount(m_VPValue(), m_VPValue(VectorTC)));
7565 assert(MatchedTC && "must match vector trip count");
7566
7567 // If there is a suitable resume value for the canonical induction in the
7568 // scalar (which will become vector) epilogue loop, use it and move it to the
7569 // beginning of the scalar preheader. Otherwise create it below.
7570 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
7571 auto ResumePhiIter =
7572 find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
7573 return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
7574 m_ZeroInt()));
7575 });
7576 VPPhi *ResumePhi = nullptr;
7577 if (ResumePhiIter == MainScalarPH->phis().end()) {
 // No matching phi exists: create one at the very start of the preheader.
7578 Type *Ty = VPTypeAnalysis(MainPlan).inferScalarType(VectorTC);
7579 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
7580 ResumePhi = ScalarPHBuilder.createScalarPhi(
7581 {VectorTC, MainPlan.getZero(Ty)}, {}, "vec.epilog.resume.val");
7582 } else {
 // Reuse the existing phi: rename it and make it the first recipe.
7583 ResumePhi = cast<VPPhi>(&*ResumePhiIter);
7584 ResumePhi->setName("vec.epilog.resume.val");
7585 if (&MainScalarPH->front() != ResumePhi)
7586 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
7587 }
7588
7589 // Create a ResumeForEpilogue for the canonical IV resume as the
7590 // first non-phi, to keep it alive for the epilogue.
7591 VPBuilder ResumeBuilder(MainScalarPH);
7592 ResumeBuilder.createNaryOp(VPInstruction::ResumeForEpilogue, ResumePhi);
7593
7594 // Create ResumeForEpilogue instructions for the resume phis of the
7595 // VPIRPhis in the scalar header of the main plan and return them so they can
7596 // be used as resume values when vectorizing the epilogue.
7597 return to_vector(
7598 map_range(MainPlan.getScalarHeader()->phis(), [&](VPRecipeBase &R) {
7599 assert(isa<VPIRPhi>(R) &&
7600 "only VPIRPhis expected in the scalar header");
7601 return ResumeBuilder.createNaryOp(VPInstruction::ResumeForEpilogue,
7602 R.getOperand(0));
7603 }));
7604}
7605
7606/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
7607/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
7608/// reductions require creating new instructions to compute the resume values.
7609/// They are collected in a vector and returned. They must be moved to the
7610/// preheader of the vector epilogue loop, after created by the execution of \p
7611/// Plan.
7613 VPlan &MainPlan, VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
7615 VFSelectionContext &Config, ScalarEvolution &SE) {
7616 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
7617 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
7618 Header->setName("vec.epilog.vector.body");
7619
7620 VPValue *IV = VectorLoop->getCanonicalIV();
7621 // When vectorizing the epilogue loop, the canonical induction needs to start
7622 // at the resume value from the main vector loop. Find the resume value
7623 // created during execution of the main VPlan. Add this resume value as an
7624 // offset to the canonical IV of the epilogue loop.
7625 using namespace llvm::PatternMatch;
7626 VPInstruction *ResumeForEpilogue =
7628 Value *EPResumeVal = ResumeForEpilogue->getUnderlyingValue();
7629 if (auto *ResumePhi = dyn_cast<PHINode>(EPResumeVal)) {
7630 for (Value *Inc : ResumePhi->incoming_values()) {
7631 if (match(Inc, m_SpecificInt(0)))
7632 continue;
7633 assert(!EPI.VectorTripCount &&
7634 "Must only have a single non-zero incoming value");
7635 EPI.VectorTripCount = Inc;
7636 }
7637 // If we didn't find a non-zero vector trip count, all incoming values
7638 // must be zero, which also means the vector trip count is zero.
7639 if (!EPI.VectorTripCount) {
7640 assert(ResumePhi->getNumIncomingValues() > 0 &&
7641 all_of(ResumePhi->incoming_values(), match_fn(m_SpecificInt(0))) &&
7642 "all incoming values must be 0");
7643 EPI.VectorTripCount = ResumePhi->getIncomingValue(0);
7644 }
7645 } else {
7646 EPI.VectorTripCount = EPResumeVal;
7647 }
7648 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
7649 assert(all_of(IV->users(),
7650 [](const VPUser *U) {
7651 if (isa<VPScalarIVStepsRecipe, VPDerivedIVRecipe>(U))
7652 return true;
7653 unsigned Opc = cast<VPInstruction>(U)->getOpcode();
7654 return Instruction::isCast(Opc) || Opc == Instruction::Add;
7655 }) &&
7656 "the canonical IV should only be used by its increment or "
7657 "ScalarIVSteps when resetting the start value");
7658 VPBuilder Builder(Header, Header->getFirstNonPhi());
7659 VPInstruction *Add = Builder.createAdd(IV, VPV);
7660 // Replace all users of the canonical IV and its increment with the offset
7661 // version, except for the Add itself and the canonical IV increment.
7663 assert(Increment && "Must have a canonical IV increment at this point");
7664 IV->replaceUsesWithIf(Add, [Add, Increment](VPUser &U, unsigned) {
7665 return &U != Add && &U != Increment;
7666 });
7667 VPInstruction *OffsetIVInc =
7669 Increment->replaceAllUsesWith(OffsetIVInc);
7670 OffsetIVInc->setOperand(0, Increment);
7671
7673 SmallVector<Instruction *> InstsToMove;
7674 // Ensure that the start values for all header phi recipes are updated before
7675 // vectorizing the epilogue loop.
7676 for (VPRecipeBase &R : Header->phis()) {
7677 Value *ResumeV = nullptr;
7678 // TODO: Move setting of resume values to prepareToExecute.
7679 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
7680 // Find the reduction result by searching users of the phi or its backedge
7681 // value.
7682 auto IsReductionResult = [](VPRecipeBase *R) {
7683 auto *VPI = dyn_cast<VPInstruction>(R);
7684 return VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult;
7685 };
7686 auto *RdxResult = cast<VPInstruction>(
7687 vputils::findRecipe(ReductionPhi->getBackedgeValue(), IsReductionResult));
7688 assert(RdxResult && "expected to find reduction result");
7689
7690 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
7691 ->getIncomingValueForBlock(L->getLoopPreheader());
7692
7693 // Check for FindIV pattern by looking for icmp user of RdxResult.
7694 // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
7695 using namespace VPlanPatternMatch;
7696 VPValue *SentinelVPV = nullptr;
7697 bool IsFindIV = any_of(RdxResult->users(), [&](VPUser *U) {
7698 return match(U, VPlanPatternMatch::m_SpecificICmp(
7699 ICmpInst::ICMP_NE, m_Specific(RdxResult),
7700 m_VPValue(SentinelVPV)));
7701 });
7702
7703 RecurKind RK = ReductionPhi->getRecurrenceKind();
7704 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || IsFindIV) {
7705 auto *ResumePhi = cast<PHINode>(ResumeV);
7706 Value *StartV = ResumePhi->getIncomingValueForBlock(
7708 IRBuilder<> Builder(ResumePhi->getParent(),
7709 ResumePhi->getParent()->getFirstNonPHIIt());
7710
7712 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
7713 // start value; compare the final value from the main vector loop
7714 // to the start value.
7715 ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
7716 if (auto *I = dyn_cast<Instruction>(ResumeV))
7717 InstsToMove.push_back(I);
7718 } else {
7719 assert(SentinelVPV && "expected to find icmp using RdxResult");
7720 if (auto *FreezeI = dyn_cast<FreezeInst>(StartV))
7721 ToFrozen[FreezeI->getOperand(0)] = StartV;
7722
7723 // Adjust resume: select(icmp eq ResumeV, StartV), Sentinel, ResumeV
7724 Value *Cmp = Builder.CreateICmpEQ(ResumeV, StartV);
7725 if (auto *I = dyn_cast<Instruction>(Cmp))
7726 InstsToMove.push_back(I);
7727 ResumeV = Builder.CreateSelect(Cmp, SentinelVPV->getLiveInIRValue(),
7728 ResumeV);
7729 if (auto *I = dyn_cast<Instruction>(ResumeV))
7730 InstsToMove.push_back(I);
7731 }
7732 } else {
7733 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
7734 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
7735 if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
7737 "unexpected start value");
7738 // Partial sub-reductions always start at 0 and account for the
7739 // reduction start value in a final subtraction. Update it to use the
7740 // resume value from the main vector loop.
7741 if (PhiR->getVFScaleFactor() > 1 &&
7743 PhiR->getRecurrenceKind())) {
7744 auto *Sub = cast<VPInstruction>(RdxResult->getSingleUser());
7745 assert((Sub->getOpcode() == Instruction::Sub ||
7746 Sub->getOpcode() == Instruction::FSub) &&
7747 "Unexpected opcode");
7748 assert(isa<VPIRValue>(Sub->getOperand(0)) &&
7749 "Expected operand to match the original start value of the "
7750 "reduction");
7751 // For integer sub-reductions, verify start value is zero.
7752 // For FP sub-reductions, verify start value is negative zero.
7753 [[maybe_unused]] auto StartValueIsIdentity = [&] {
7754 Value *IdentityValue = getRecurrenceIdentity(
7755 PhiR->getRecurrenceKind(), ResumeV->getType(),
7756 PhiR->getFastMathFlags());
7757 auto *StartValue = dyn_cast<VPIRValue>(VPI->getOperand(0));
7758 return StartValue && StartValue->getValue() == IdentityValue;
7759 };
7760 assert(StartValueIsIdentity() &&
7761 "Expected start value for partial sub-reduction to be zero "
7762 "(or negative zero)");
7763
7764 Sub->setOperand(0, StartVal);
7765 } else
7766 VPI->setOperand(0, StartVal);
7767 continue;
7768 }
7769 }
7770 } else {
7771 // Retrieve the induction resume values for wide inductions from
7772 // their original phi nodes in the scalar loop.
7773 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
7774 // Hook up to the PHINode generated by a ResumePhi recipe of main
7775 // loop VPlan, which feeds the scalar loop.
7776 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
7777 }
7778 assert(ResumeV && "Must have a resume value");
7779 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
7780 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
7781 }
7782
7783 // For some VPValues in the epilogue plan we must re-use the generated IR
7784 // values from the main plan. Replace them with live-in VPValues.
7785 // TODO: This is a workaround needed for epilogue vectorization and it
7786 // should be removed once induction resume value creation is done
7787 // directly in VPlan.
7788 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
7789 // Re-use frozen values from the main plan for Freeze VPInstructions in the
7790 // epilogue plan. This ensures all users use the same frozen value.
7791 auto *VPI = dyn_cast<VPInstruction>(&R);
7792 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
7794 ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
7795 continue;
7796 }
7797
7798 // Re-use the trip count and steps expanded for the main loop, as
7799 // skeleton creation needs it as a value that dominates both the scalar
7800 // and vector epilogue loops
7801 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
7802 if (!ExpandR)
7803 continue;
7804 VPValue *ExpandedVal =
7805 Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
7806 ExpandR->replaceAllUsesWith(ExpandedVal);
7807 if (Plan.getTripCount() == ExpandR)
7808 Plan.resetTripCount(ExpandedVal);
7809 ExpandR->eraseFromParent();
7810 }
7811
7812 auto VScale = Config.getVScaleForTuning();
7813 unsigned MainLoopStep =
7814 estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
7815 unsigned EpilogueLoopStep =
7816 estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
7820 EPI.EpilogueVF, EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
7821
7822 return InstsToMove;
7823}
7824
/// Patch phi nodes on the scalar-loop side so that they carry correct incoming
/// values for edges that arrive via \p BypassBlock.
///
/// Two things are fixed up:
///  1. Phis in the preheader of \p L that lack an entry for one of the
///     preheader's predecessors get one, reusing the value they already
///     receive from \p BypassBlock.
///  2. For every scalar header phi of \p BestEpiPlan, the resume phi feeding it
///     from the preheader takes, for the \p BypassBlock edge, the value the
///     matching main-loop resume phi has for that same edge.
///
/// \p ResumeValues is walked in lock-step with the scalar header phis.
/// NOTE(review): the line carrying the function name and its leading
/// parameters is not present in this listing; \p BypassBlock and \p L are
/// used below and the call site passes them as the first two arguments.
7825static void
7827 VPlan &BestEpiPlan,
7828 ArrayRef<VPInstruction *> ResumeValues) {
7829 // Fix resume values from the additional bypass block.
7830 BasicBlock *PH = L->getLoopPreheader();
// A phi needs one incoming entry per predecessor. Where a predecessor has no
// entry yet, add one that repeats the value coming from BypassBlock.
7831 for (auto *Pred : predecessors(PH)) {
7832 for (PHINode &Phi : PH->phis()) {
7833 if (Phi.getBasicBlockIndex(Pred) != -1)
7834 continue;
7835 Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
7836 }
7837 }
7838 auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
7839 if (ScalarPH->hasPredecessors()) {
7840 // Fix resume values for inductions and reductions from the additional
7841 // bypass block using the incoming values from the main loop's resume phis.
7842 // ResumeValues correspond 1:1 with the scalar loop header phis.
7843 for (auto [ResumeV, HeaderPhi] :
7844 zip(ResumeValues, BestEpiPlan.getScalarHeader()->phis())) {
7845 auto *HeaderPhiR = cast<VPIRPhi>(&HeaderPhi);
// The value the header phi receives from the preheader is itself a phi: the
// epilogue's resume phi.
7846 auto *EpiResumePhi =
7847 cast<PHINode>(HeaderPhiR->getIRPhi().getIncomingValueForBlock(PH));
// Nothing to patch if this resume phi has no edge from the bypass block.
7848 if (EpiResumePhi->getBasicBlockIndex(BypassBlock) == -1)
7849 continue;
7850 auto *MainResumePhi = cast<PHINode>(ResumeV->getUnderlyingValue());
7851 EpiResumePhi->setIncomingValueForBlock(
7852 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7853 }
7854 }
7855}
7856
7857/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
7858/// loop, after both plans have executed, updating branches from the iteration
7859/// and runtime checks of the main loop, as well as updating various phis. \p
7860/// InstsToMove contains instructions that need to be moved to the preheader of
7861/// the epilogue vector loop.
///
/// \p Checks supplies the SCEV and memory runtime-check blocks (either may be
/// null), \p DT is kept up to date eagerly while edges are redirected, and
/// \p ResumeValues is forwarded to fixScalarResumeValuesFromBypass.
7862static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
7864 DominatorTree *DT,
7865 GeneratedRTChecks &Checks,
7866 ArrayRef<Instruction *> InstsToMove,
7867 ArrayRef<VPInstruction *> ResumeValues) {
// The IR block behind the epilogue plan's entry is the block that decides
// whether the epilogue vector loop runs at all.
7868 BasicBlock *VecEpilogueIterationCountCheck =
7869 cast<VPIRBasicBlock>(EpiPlan.getEntry())->getIRBasicBlock();
7870
// Successor 1 of that block's conditional branch is taken to be the epilogue
// vector preheader.
7871 BasicBlock *VecEpiloguePreHeader =
7872 cast<CondBrInst>(VecEpilogueIterationCountCheck->getTerminator())
7873 ->getSuccessor(1);
7874 // Adjust the control flow taking the state info from the main loop
7875 // vectorization into account.
7877 "expected this to be saved from the previous pass.");
7878 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
7879
7880 // Helper to redirect an edge from \p BB to \p VecEpilogueIterationCountCheck
7881 // to \p NewSucc instead, updating the DomTree.
7882 auto RedirectEdge = [&](BasicBlock *BB, BasicBlock *NewSucc) {
7883 BB->getTerminator()->replaceUsesOfWith(VecEpilogueIterationCountCheck,
7884 NewSucc);
7885 DTU.applyUpdates(
7886 {{DominatorTree::Delete, BB, VecEpilogueIterationCountCheck},
7887 {DominatorTree::Insert, BB, NewSucc}});
7888 };
7889
// The main loop's iteration-count check now branches straight to the epilogue
// vector preheader instead of to the epilogue check block.
7890 RedirectEdge(EPI.MainLoopIterationCountCheck, VecEpiloguePreHeader);
7891
// The epilogue iteration-count check and any runtime-check blocks branch to
// the scalar preheader instead.
7892 BasicBlock *ScalarPH =
7893 cast<VPIRBasicBlock>(EpiPlan.getScalarPreheader())->getIRBasicBlock();
7894 RedirectEdge(EPI.EpilogueIterationCountCheck, ScalarPH);
7895
7896 // Adjust the terminators of runtime check blocks and phis using them.
7897 BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
7898 BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
7899 if (SCEVCheckBlock)
7900 RedirectEdge(SCEVCheckBlock, ScalarPH);
7901 if (MemCheckBlock)
7902 RedirectEdge(MemCheckBlock, ScalarPH);
7903
7904 // The vec.epilog.iter.check block may contain Phi nodes from inductions
7905 // or reductions which merge control-flow from the latch block and the
7906 // middle block. Update the incoming values here and move the Phi into the
7907 // preheader.
// Snapshot the phis first: the loop below moves them out of the block it
// would otherwise be iterating over.
7908 SmallVector<PHINode *, 4> PhisInBlock(
7909 llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
7910
7911 for (PHINode *Phi : PhisInBlock) {
7912 Phi->moveBefore(VecEpiloguePreHeader->getFirstNonPHIIt());
// Once the phi lives in the preheader, the value that used to arrive from the
// check block's single predecessor arrives via the check block itself.
7913 Phi->replaceIncomingBlockWith(
7914 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7915 VecEpilogueIterationCountCheck);
7916
7917 // If the phi doesn't have an incoming value from the
7918 // EpilogueIterationCountCheck, we are done. Otherwise remove the
7919 // incoming value and also those from other check blocks. This is needed
7920 // for reduction phis only.
7921 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7922 return EPI.EpilogueIterationCountCheck == IncB;
7923 }))
7924 continue;
7925 for (BasicBlock *BB :
7926 {EPI.EpilogueIterationCountCheck, SCEVCheckBlock, MemCheckBlock}) {
7927 if (BB)
7928 Phi->removeIncomingValue(BB);
7929 }
7930 }
7931
// Place the caller-provided instructions after the phis of the epilogue
// vector preheader, keeping their relative order.
7932 auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
7933 for (auto *I : InstsToMove)
7934 I->moveBefore(IP);
7935
7936 // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
7937 // after executing the main loop. We need to update the resume values of
7938 // inductions and reductions during epilogue vectorization.
7939 fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
7940 ResumeValues);
7941
7942 // Remove dead phis that were moved to the epilogue preheader but are unused
7943 // (e.g., resume phis for inductions not widened in the epilogue vector loop).
7944 for (PHINode &Phi : make_early_inc_range(VecEpiloguePreHeader->phis()))
7945 if (Phi.use_empty())
7946 Phi.eraseFromParent();
7947}
7948
// Body of the per-loop driver (invoked as processLoop(L) from the worklist
// loop in this file). It runs the hint, legality and cost-model checks for
// loop L, emits optimization remarks describing the decision, and finally
// executes the selected VPlan(s). Returns false on every bail-out path and
// true once a plan has been executed.
// NOTE(review): the line carrying the function signature is not present in
// this listing, and several other lines are missing further down.
7950 assert((EnableVPlanNativePath || L->isInnermost()) &&
7951 "VPlan-native path is not enabled. Only process inner loops.");
7952
7953 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
7954 << L->getHeader()->getParent()->getName() << "' from "
7955 << L->getLocStr() << "\n");
7956
7957 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
7958
7959 LLVM_DEBUG(
7960 dbgs() << "LV: Loop hints:"
7961 << " force="
7963 ? "disabled"
7965 ? "enabled"
7966 : "?"))
7967 << " width=" << Hints.getWidth()
7968 << " interleave=" << Hints.getInterleave() << "\n");
7969
7970 // Function containing loop
7971 Function *F = L->getHeader()->getParent();
7972
7973 // Looking at the diagnostic output is the only way to determine if a loop
7974 // was vectorized (other than looking at the IR or machine code), so it
7975 // is important to generate an optimization remark for each loop. Most of
7976 // these messages are generated as OptimizationRemarkAnalysis. Remarks
7977 // generated as OptimizationRemark and OptimizationRemarkMissed are
7978 // less verbose reporting vectorized loops and unvectorized loops that may
7979 // benefit from vectorization, respectively.
7980
7981 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7982 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7983 return false;
7984 }
7985
7986 PredicatedScalarEvolution PSE(*SE, *L);
7987
7988 // Query this against the original loop and save it here because the profile
7989 // of the original loop header may change as the transformation happens.
7990 bool OptForSize = llvm::shouldOptimizeForSize(
7991 L->getHeader(), PSI,
7992 PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
7994
7995 // Check if it is legal to vectorize the loop.
7996 LoopVectorizationRequirements Requirements;
7997 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
7998 &Requirements, &Hints, DB, AC,
7999 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
8001 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8002 Hints.emitRemarkWithHints();
8003 return false;
8004 }
8005
8006 bool IsInnerLoop = L->isInnermost();
8007
8008 // Outer loops require a computable trip count.
8009 if (!IsInnerLoop && isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8010 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8011 return false;
8012 }
8013
8014 if (LVL.hasUncountableEarlyExit()) {
8016 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
8017 "early exit is not enabled",
8018 "UncountableEarlyExitLoopsDisabled", ORE, L);
8019 return false;
8020 }
8021 }
8022
8023 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8024 bool UseInterleaved =
8025 IsInnerLoop && TTI->enableInterleavedAccessVectorization();
8026
8027 // If an override option has been passed in for interleaved accesses, use it.
8028 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8029 UseInterleaved = IsInnerLoop && EnableInterleavedMemAccesses;
8030
8031 // Analyze interleaved memory accesses.
8032 if (UseInterleaved)
8034
8035 if (LVL.hasUncountableEarlyExit()) {
8036 BasicBlock *LoopLatch = L->getLoopLatch();
8037 if (IAI.requiresScalarEpilogue() ||
8038 any_of(LVL.getCountableExitingBlocks(), not_equal_to(LoopLatch))) {
8039 reportVectorizationFailure("Auto-vectorization of early exit loops "
8040 "requiring a scalar epilogue is unsupported",
8041 "UncountableEarlyExitUnsupported", ORE, L);
8042 return false;
8043 }
8044 }
8045
8046 // Check the function attributes and profiles to find out if this function
8047 // should be optimized for size.
8048 EpilogueLowering SEL =
8049 getEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, &IAI);
8050
8051 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8052 // count by optimizing for size, to minimize overheads.
8053 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
8054 if (ExpectedTC && ExpectedTC->isFixed() &&
8055 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
8056 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8057 << "This loop is worth vectorizing only if no scalar "
8058 << "iteration overheads are incurred.");
8060 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8061 else {
8062 LLVM_DEBUG(dbgs() << "\n");
8063 // Tail-folded loops are efficient even when the loop
8064 // iteration count is low. However, setting the epilogue policy to
8065 // `CM_EpilogueNotAllowedLowTripLoop` prevents vectorizing loops
8066 // with runtime checks. It's more effective to let
8067 // `isOutsideLoopWorkProfitable` determine if vectorization is
8068 // beneficial for the loop.
8071 }
8072 }
8073
8074 // Check the function attributes to see if implicit floats or vectors are
8075 // allowed.
8076 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8078 "Can't vectorize when the NoImplicitFloat attribute is used",
8079 "loop not vectorized due to NoImplicitFloat attribute",
8080 "NoImplicitFloat", ORE, L);
8081 Hints.emitRemarkWithHints();
8082 return false;
8083 }
8084
8085 // Check if the target supports potentially unsafe FP vectorization.
8086 // FIXME: Add a check for the type of safety issue (denormal, signaling)
8087 // for the target we're vectorizing for, to make sure none of the
8088 // additional fp-math flags can help.
8089 if (Hints.isPotentiallyUnsafe() &&
8090 TTI->isFPVectorizationPotentiallyUnsafe()) {
8092 "Potentially unsafe FP op prevents vectorization",
8093 "loop not vectorized due to unsafe FP support.",
8094 "UnsafeFP", ORE, L);
8095 Hints.emitRemarkWithHints();
8096 return false;
8097 }
8098
8099 bool AllowOrderedReductions;
8100 // If the flag is set, use that instead and override the TTI behaviour.
8101 if (ForceOrderedReductions.getNumOccurrences() > 0)
8102 AllowOrderedReductions = ForceOrderedReductions;
8103 else
8104 AllowOrderedReductions = TTI->enableOrderedReductions();
8105 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
8106 ORE->emit([&]() {
8107 auto *ExactFPMathInst = Requirements.getExactFPInst();
8108 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
8109 ExactFPMathInst->getDebugLoc(),
8110 ExactFPMathInst->getParent())
8111 << "loop not vectorized: cannot prove it is safe to reorder "
8112 "floating-point operations";
8113 });
8114 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
8115 "reorder floating-point operations\n");
8116 Hints.emitRemarkWithHints();
8117 return false;
8118 }
8119
8120 // Use the cost model.
8121 VFSelectionContext Config(*TTI, &LVL, L, *F, PSE, DB, ORE, &Hints,
8122 OptForSize);
8123 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, AC, ORE,
8124 GetBFI, F, &Hints, IAI, Config);
8125 // Use the planner for vectorization.
8126 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, Config, IAI, PSE,
8127 Hints, ORE);
8128
8129 EpilogueLowering EpilogueTailLoweringStatus =
8131 if (EpilogueTailLoweringStatus ==
8133 // TODO: Apply tail-folding on the vectorized epilogue loop.
8134 LLVM_DEBUG(dbgs() << "LV: epilogue tail-folding is not supported yet\n");
8136 "The epilogue-tail-folding policy prefer-fold-tail is not supported "
8137 "yet, fall back to a normal epilogue",
8138 "UnsupportedEpilogueTailFoldingPolicy", ORE, L);
8139 }
8140
8141 // Get user vectorization factor and interleave count.
8142 ElementCount UserVF = Hints.getWidth();
8143 unsigned UserIC = Hints.getInterleave();
8144 // Outer loops don't have LoopAccessInfo, so skip the safety check and reset
8145 // UserIC (interleaving is not supported for outer loops).
8146 if (!IsInnerLoop)
8147 UserIC = 0;
8148 else if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
8149 UserIC = 1;
8150
8151 // Plan how to best vectorize.
8152 LVP.plan(UserVF, UserIC);
8153 auto [VF, BestPlanPtr] = LVP.computeBestVF();
8154 unsigned IC = 1;
8155
8156 // For VPlan build stress testing of outer loops, bail after plan
8157 // construction.
8158 if (!IsInnerLoop && VPlanBuildOuterloopStressTest)
8159 return false;
8160
8161 if (IsInnerLoop && ORE->allowExtraAnalysis(LV_NAME))
8163
8164 GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind);
8165 if (IsInnerLoop && LVP.hasPlanWithVF(VF.Width)) {
8166 // Select the interleave count.
8167 IC = LVP.selectInterleaveCount(*BestPlanPtr, VF.Width, VF.Cost);
8168
8169 unsigned SelectedIC = std::max(IC, UserIC);
8170 // Optimistically generate runtime checks if they are needed. Drop them if
8171 // they turn out to not be profitable.
8172 if (VF.Width.isVector() || SelectedIC > 1) {
8173 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
8174 *ORE);
8175
8176 // Bail out early if either the SCEV or memory runtime checks are known to
8177 // fail. In that case, the vector loop would never execute.
8178 using namespace llvm::PatternMatch;
8179 if (Checks.getSCEVChecks().first &&
8180 match(Checks.getSCEVChecks().first, m_One()))
8181 return false;
8182 if (Checks.getMemRuntimeChecks().first &&
8183 match(Checks.getMemRuntimeChecks().first, m_One()))
8184 return false;
8185 }
8186
8187 // Check if it is profitable to vectorize with runtime checks.
8188 bool ForceVectorization =
8190 VPCostContext CostCtx(CM.TTI, *CM.TLI, *BestPlanPtr, CM, Config.CostKind,
8191 CM.PSE, L);
8192 if (!ForceVectorization &&
8193 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, *BestPlanPtr,
8194 SEL, Config.getVScaleForTuning())) {
8195 ORE->emit([&]() {
8197 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
8198 L->getHeader())
8199 << "loop not vectorized: cannot prove it is safe to reorder "
8200 "memory operations";
8201 });
8202 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8203 Hints.emitRemarkWithHints();
8204 return false;
8205 }
8206 }
8207
8208 // Identify the diagnostic messages that should be produced.
// Each pair holds {remark name, remark message}.
8209 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8210 bool VectorizeLoop = true, InterleaveLoop = true;
8211 if (VF.Width.isScalar()) {
8212 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8213 VecDiagMsg = {
8214 "VectorizationNotBeneficial",
8215 "the cost-model indicates that vectorization is not beneficial"};
8216 VectorizeLoop = false;
8217 }
8218
// UserIC == 1 while the hint asked for more means the request was clamped
// above because the loop is not safe for any vector width.
8219 if (UserIC == 1 && Hints.getInterleave() > 1) {
8221 "UserIC should only be ignored due to unsafe dependencies");
8222 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
8223 IntDiagMsg = {"InterleavingUnsafe",
8224 "Ignoring user-specified interleave count due to possibly "
8225 "unsafe dependencies in the loop."};
8226 InterleaveLoop = false;
8227 } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
8228 // Tell the user interleaving was avoided up-front, despite being explicitly
8229 // requested.
8230 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8231 "interleaving should be avoided up front\n");
8232 IntDiagMsg = {"InterleavingAvoided",
8233 "Ignoring UserIC, because interleaving was avoided up front"};
8234 InterleaveLoop = false;
8235 } else if (IC == 1 && UserIC <= 1) {
8236 // Tell the user interleaving is not beneficial.
8237 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8238 IntDiagMsg = {
8239 "InterleavingNotBeneficial",
8240 "the cost-model indicates that interleaving is not beneficial"};
8241 InterleaveLoop = false;
8242 if (UserIC == 1) {
8243 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8244 IntDiagMsg.second +=
8245 " and is explicitly disabled or interleave count is set to 1";
8246 }
8247 } else if (IC > 1 && UserIC == 1) {
8248 // Tell the user interleaving is beneficial, but it is explicitly disabled.
8249 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
8250 "disabled.\n");
8251 IntDiagMsg = {"InterleavingBeneficialButDisabled",
8252 "the cost-model indicates that interleaving is beneficial "
8253 "but is explicitly disabled or interleave count is set to 1"};
8254 InterleaveLoop = false;
8255 }
8256
8257 // If there is a histogram in the loop, do not just interleave without
8258 // vectorizing. The order of operations will be incorrect without the
8259 // histogram intrinsics, which are only used for recipes with VF > 1.
8260 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
8261 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
8262 << "to histogram operations.\n");
8263 IntDiagMsg = {
8264 "HistogramPreventsScalarInterleaving",
8265 "Unable to interleave without vectorization due to constraints on "
8266 "the order of histogram operations"};
8267 InterleaveLoop = false;
8268 }
8269
8270 // Override IC if user provided an interleave count.
8271 IC = UserIC > 0 ? UserIC : IC;
8272
8273 // Emit diagnostic messages, if any.
8274 if (!VectorizeLoop && !InterleaveLoop) {
8275 // Do not vectorize or interleave the loop.
8276 ORE->emit([&]() {
8277 return OptimizationRemarkMissed(LV_NAME, VecDiagMsg.first,
8278 L->getStartLoc(), L->getHeader())
8279 << VecDiagMsg.second;
8280 });
8281 ORE->emit([&]() {
8282 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8283 L->getStartLoc(), L->getHeader())
8284 << IntDiagMsg.second;
8285 });
8286 return false;
8287 }
8288
8289 if (!VectorizeLoop && InterleaveLoop) {
8290 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8291 ORE->emit([&]() {
8292 return OptimizationRemarkAnalysis(LV_NAME, VecDiagMsg.first,
8293 L->getStartLoc(), L->getHeader())
8294 << VecDiagMsg.second;
8295 });
8296 } else if (VectorizeLoop && !InterleaveLoop) {
8297 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8298 << ") in " << L->getLocStr() << '\n');
8299 ORE->emit([&]() {
8300 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8301 L->getStartLoc(), L->getHeader())
8302 << IntDiagMsg.second;
8303 });
8304 } else if (VectorizeLoop && InterleaveLoop) {
8305 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8306 << ") in " << L->getLocStr() << '\n');
8307 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8308 }
8309
8310 // Report the vectorization decision.
8311 if (VF.Width.isScalar()) {
8312 using namespace ore;
8313 assert(IC > 1);
8314 ORE->emit([&]() {
8315 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8316 L->getHeader())
8317 << "interleaved loop (interleaved count: "
8318 << NV("InterleaveCount", IC) << ")";
8319 });
8320 } else {
8321 // Report the vectorization decision.
8322 reportVectorization(ORE, L, VF, IC);
8323 }
8324 if (ORE->allowExtraAnalysis(LV_NAME))
8326
8327 // If we decided that it is *legal* to interleave or vectorize the loop, then
8328 // do it.
8329
8330 VPlan &BestPlan = *BestPlanPtr;
8331 // Consider vectorizing the epilogue too if it's profitable.
8332 std::unique_ptr<VPlan> EpiPlan =
8333 LVP.selectBestEpiloguePlan(BestPlan, VF.Width, IC);
8334 bool HasBranchWeights =
8335 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
8336 if (EpiPlan) {
8337 VPlan &BestEpiPlan = *EpiPlan;
8338 VPlan &BestMainPlan = BestPlan;
8339 ElementCount EpilogueVF = BestEpiPlan.getSingleVF();
8340
8341 // The first pass vectorizes the main loop and creates a scalar epilogue
8342 // to be vectorized by executing the plan (potentially with a different
8343 // factor) again shortly afterwards.
8344 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
8345 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
8346 SmallVector<VPInstruction *> ResumeValues =
8347 preparePlanForMainVectorLoop(BestMainPlan, BestEpiPlan);
// The epilogue is set up with an interleave count of 1 (fourth argument).
8348 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF, 1, BestEpiPlan);
8349
8350 // Add minimum iteration check for the epilogue plan, followed by runtime
8351 // checks for the main plan.
8352 LVP.addMinimumIterationCheck(BestMainPlan, EPI.EpilogueVF, EPI.EpilogueUF,
8354 LVP.attachRuntimeChecks(BestMainPlan, Checks, HasBranchWeights);
8356 EPI.MainLoopVF, EPI.MainLoopUF,
8358 HasBranchWeights ? MinItersBypassWeights : nullptr,
8359 L->getLoopPredecessor()->getTerminator()->getDebugLoc(),
8360 PSE);
8361
8362 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
8363 Checks, BestMainPlan);
8364 auto ExpandedSCEVs = LVP.executePlan(
8365 EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT,
8367 ++LoopsVectorized;
8368
8369 // Derive EPI fields from VPlan-generated IR.
8370 BasicBlock *EntryBB =
8371 cast<VPIRBasicBlock>(BestMainPlan.getEntry())->getIRBasicBlock();
8372 EntryBB->setName("iter.check");
8373 EPI.EpilogueIterationCountCheck = EntryBB;
8374 // The check chain is: Entry -> [SCEV] -> [Mem] -> MainCheck -> VecPH.
8375 // MainCheck is the non-bypass successor of the last runtime check block
8376 // (or Entry if there are no runtime checks).
8377 BasicBlock *LastCheck = EntryBB;
8378 if (BasicBlock *MemBB = Checks.getMemRuntimeChecks().second)
8379 LastCheck = MemBB;
8380 else if (BasicBlock *SCEVBB = Checks.getSCEVChecks().second)
8381 LastCheck = SCEVBB;
8382 BasicBlock *ScalarPH = L->getLoopPreheader();
// The comparison below evaluates to 1 when successor 0 is the scalar
// preheader and to 0 otherwise, so it indexes the successor that is not the
// scalar preheader.
8383 auto *BI = cast<CondBrInst>(LastCheck->getTerminator());
8385 BI->getSuccessor(BI->getSuccessor(0) == ScalarPH);
8386
8387 // Second pass vectorizes the epilogue and adjusts the control flow
8388 // edges from the first pass.
8389 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
8390 Checks, BestEpiPlan);
8392 BestMainPlan, BestEpiPlan, L, ExpandedSCEVs, EPI, CM, Config,
8393 *PSE.getSE());
8394 LVP.attachRuntimeChecks(BestEpiPlan, Checks, HasBranchWeights);
8395 LVP.executePlan(
8396 EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
8398 connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, Checks, InstsToMove,
8399 ResumeValues);
8400 ++LoopsEpilogueVectorized;
8401 } else {
// No epilogue plan was selected: execute the single best plan.
8402 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
8403 BestPlan);
8404 LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
8405 VF.MinProfitableTripCount);
8406 LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights);
8407
8408 if (!IsInnerLoop)
8409 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName()
8410 << "\"\n");
8411 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
8412 ++LoopsVectorized;
8413 }
8414
8415 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
8416 "DT not preserved correctly");
8417 assert(!verifyFunction(*F, &dbgs()));
8418
8419 return true;
8420}
8421
8423
// Function-level driver (invoked as runImpl(F) by the pass entry point in
// this file): simplifies all loops, collects the supported ones into a
// worklist and calls processLoop on each. The result reports whether anything
// changed and whether the CFG changed.
// NOTE(review): the line carrying the function signature is not present in
// this listing.
8424 // Don't attempt if
8425 // 1. the target claims to have no vector registers, and
8426 // 2. interleaving won't help ILP.
8427 //
8428 // The second condition is necessary because, even if the target has no
8429 // vector registers, loop vectorization may still enable scalar
8430 // interleaving.
8431 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8432 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
8433 return LoopVectorizeResult(false, false);
8434
8435 bool Changed = false, CFGChanged = false;
8436
8437 // The vectorizer requires loops to be in simplified form.
8438 // Since simplification may add new inner loops, it has to run before the
8439 // legality and profitability checks. This means running the loop vectorizer
8440 // will simplify all loops, regardless of whether anything ends up being
8441 // vectorized.
// A change made by simplifyLoop is recorded in both CFGChanged and Changed.
8442 for (const auto &L : *LI)
8443 Changed |= CFGChanged |=
8444 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8445
8446 // Build up a worklist of inner-loops to vectorize. This is necessary as
8447 // the act of vectorizing or partially unrolling a loop creates new loops
8448 // and can invalidate iterators across the loops.
8449 SmallVector<Loop *, 8> Worklist;
8450
8451 for (Loop *L : *LI)
8452 collectSupportedLoops(*L, LI, ORE, Worklist);
8453
8454 LoopsAnalyzed += Worklist.size();
8455
8456 // Now walk the identified inner loops.
8457 while (!Worklist.empty()) {
8458 Loop *L = Worklist.pop_back_val();
8459
8460 // For the inner loops we actually process, form LCSSA to simplify the
8461 // transform.
8462 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8463
8464 Changed |= CFGChanged |= processLoop(L);
8465
// Once anything has changed, drop the cached loop-access results before the
// next loop is processed.
8466 if (Changed) {
8467 LAIs->clear();
8468
8469#ifndef NDEBUG
8470 if (VerifySCEV)
8471 SE->verify();
8472#endif
8473 }
8474 }
8475
8476 // Report whether anything changed and whether the CFG changed.
8477 return LoopVectorizeResult(Changed, CFGChanged);
8478}
8479
// Pass entry point body: fetches the analyses for F, delegates to runImpl(F)
// and returns the set of preserved analyses.
// NOTE(review): the signature and several statements (including the
// declaration of PA) are not present in this listing.
8482 LI = &AM.getResult<LoopAnalysis>(F);
8483 // There are no loops in the function. Return before computing other
8484 // expensive analyses.
8485 if (LI->empty())
8486 return PreservedAnalyses::all();
8495 AA = &AM.getResult<AAManager>(F);
8496
// Only a cached profile summary is used here; BlockFrequencyInfo is obtained
// through the GetBFI callback.
8497 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
8498 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
8499 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
8501 };
8502 LoopVectorizeResult Result = runImpl(F);
// Nothing was modified, so every analysis remains valid.
8503 if (!Result.MadeAnyChange)
8504 return PreservedAnalyses::all();
8506
8507 if (isAssignmentTrackingEnabled(*F.getParent())) {
8508 for (auto &BB : F)
8510 }
8511
8512 PA.preserve<LoopAnalysis>();
8516
8517 if (Result.MadeCFGChange) {
8518 // Making CFG changes likely means a loop got vectorized. Indicate that
8519 // extra simplification passes should be run.
8520 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
8521 // be run if runtime checks have been added.
8524 } else {
8526 }
8527 return PA;
8528}
8529
// Prints this pass as pipeline text to OS: first the base PassInfoMixin
// output, then the two boolean options between '<' and '>'. Each option is
// written as "<name>;" when set and "no-<name>;" when not set.
// NOTE(review): the line naming the function is not present in this listing.
8531 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
8532 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
8533 OS, MapClassName2PassName);
8534
8535 OS << '<';
8536 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
8537 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
8538 OS << '>';
8539}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Lower Kernel Arguments
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static InstructionCost getCost(Instruction &Inst, TTI::TargetCostKind CostKind, TargetTransformInfo &TTI)
Definition CostModel.cpp:73
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
static bool hasNoUnsignedWrap(BinaryOperator &I)
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
#define LV_NAME
This file defines the LoopVectorizationLegality class.
cl::opt< bool > VPlanBuildOuterloopStressTest
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static unsigned getMaxTCFromNonZeroRange(PredicatedScalarEvolution &PSE, Loop *L)
Get the maximum trip count for L from the SCEV unsigned range, excluding zero from the range.
static Type * maybeVectorizeType(Type *Ty, ElementCount VF)
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L)
A version of ScalarEvolution::getSmallConstantTripCount that returns an ElementCount to include loops...
static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan)
Returns true if the VPlan contains header phi recipes that are not currently supported for epilogue v...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI, DominatorTree *DT, GeneratedRTChecks &Checks, ArrayRef< Instruction * > InstsToMove, ArrayRef< VPInstruction * > ResumeValues)
Connect the epilogue vector loop generated for EpiPlan to the main vector loop, after both plans have...
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< cl::boolOrDefault > ForceMaskedDivRem("force-widen-divrem-via-masked-intrinsic", cl::Hidden, cl::desc("Override cost based masked intrinsic widening " "for div/rem instructions"))
static void legacyCSE(BasicBlock *BB)
FIXME: This legacy common-subexpression-elimination routine is scheduled for removal,...
static VPIRBasicBlock * replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB, VPlan *Plan=nullptr)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
static Intrinsic::ID getMaskedDivRemIntrinsic(unsigned Opcode)
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
TailFoldingPolicyTy
Option tail-folding-policy controls the tail-folding strategy and lists all available options.
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static cl::opt< TailFoldingPolicyTy > EpilogueTailFoldingPolicy("epilogue-tail-folding-policy", cl::Hidden, cl::desc("Epilogue-tail-folding preferences over creating an epilogue loop."), cl::values(clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail", "Don't tail-fold loops."), clEnumValN(TailFoldingPolicyTy::PreferFoldTail, "prefer-fold-tail", "prefer tail-folding, otherwise create an epilogue when " "appropriate.")))
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static unsigned estimateElementCount(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the ElementCount at runtime.
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static SmallVector< VPInstruction * > preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, const Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static SmallVector< Instruction * > preparePlanForEpilogueVectorLoop(VPlan &MainPlan, VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM, VFSelectionContext &Config, ScalarEvolution &SE)
Prepare Plan for vectorizing the epilogue loop.
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static void printOptimizedVPlan(VPlan &)
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static std::optional< ElementCount > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true, bool CanExcludeZeroTrips=false)
Returns "best known" trip count, which is either a valid positive trip count or std::nullopt when an ...
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool hasReplicatorRegion(VPlan &Plan)
static EpilogueLowering getEpilogueTailLowering(const LoopVectorizationCostModel &MainCM, const Loop *L, OptimizationRemarkEmitter *ORE)
Determine how to lower the epilogue for the vector epilogue loop.
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static bool hasFindLastReductionPhi(VPlan &Plan)
Returns true if the VPlan contains a VPReductionPHIRecipe with FindLast recurrence kind.
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static cl::opt< TailFoldingPolicyTy > TailFoldingPolicy("tail-folding-policy", cl::init(TailFoldingPolicyTy::None), cl::Hidden, cl::desc("Tail-folding preferences over creating an epilogue loop."), cl::values(clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail", "Don't tail-fold loops."), clEnumValN(TailFoldingPolicyTy::PreferFoldTail, "prefer-fold-tail", "prefer tail-folding, otherwise create an epilogue when " "appropriate."), clEnumValN(TailFoldingPolicyTy::MustFoldTail, "must-fold-tail", "always tail-fold, don't attempt vectorization if " "tail-folding fails.")))
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, EpilogueLowering SEL, std::optional< unsigned > VScale)
This function determines whether or not it's still profitable to vectorize the loop given the extra w...
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, VPlan &Plan, ElementCount VF)
For loops with uncountable early exits, find the cost of doing work when exiting the loop early,...
cl::opt< bool > VPlanBuildOuterloopStressTest("vplan-build-outerloop-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static EpilogueLowering getEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, VPlan &BestEpiPlan, ArrayRef< VPInstruction * > ResumeValues)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None)
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
This file contains some templates that are useful if you are working with the STL at all.
#define OP(OPC)
Definition Instruction.h:46
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
#define RUN_VPLAN_PASS_NO_VERIFY(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
This class represents a function call, abstracting a target machine's calling convention.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
Conditional Branch instruction.
BasicBlock * getSuccessor(unsigned i) const
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
This class represents a range of values.
LLVM_ABI APInt getUnsignedMax() const
Return the largest unsigned value contained in the ConstantRange.
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getTemporary()
Definition DebugLoc.h:160
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
void insert_range(Range &&R)
Inserts range of 'std::pair<KeyT, ValueT>' values into the map.
Definition DenseMap.h:292
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i....
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Check, VPlan &Plan)
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
A struct for saving information about induction variables.
const SCEV * getStep() const
ArrayRef< Instruction * > getCastInsts() const
Returns an ArrayRef to the type cast instructions in the induction update chain, that are redundant w...
@ IK_PtrInduction
Pointer induction var. Step = C.
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor)
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
const TargetTransformInfo * TTI
Target Transform Info.
LoopVectorizationCostModel * Cost
The profitability analysis.
friend class LoopVectorizationPlanner
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationCostModel *CM, GeneratedRTChecks &RTChecks, VPlan &Plan)
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
DominatorTree * DT
Dominator Tree.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
virtual BasicBlock * createVectorizedLoopSkeleton()
Creates a basic block for the scalar preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
VPBasicBlock * VectorPHVPBB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
unsigned UF
The vectorization unroll factor to use.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
BasicBlock * createScalarPreheader(StringRef Prefix)
Create and return a new IR basic block for the scalar preheader whose name is prefixed with Prefix.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
LLVM_ABI APInt getMask() const
For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
Definition Type.cpp:378
The group of interleaved loads/stores sharing the same stride and close to each other.
auto members() const
Return an iterator range over the non-null members of this group, in index order.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
Drive the analysis of interleaved memory accesses in the loop.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic stride, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool useWideActiveLaneMask() const
Returns true if the use of wide lane masks is requested and the loop is using tail-folding with a lan...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
BlockFrequencyInfo * BFI
The BlockFrequencyInfo returned from GetBFI.
BlockFrequencyInfo & getBFI()
Returns the BlockFrequencyInfo for the function if cached, otherwise fetches it via GetBFI.
bool isForcedScalar(Instruction *I, ElementCount VF) const
Returns true if I has been forced to be scalarized at VF.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool preferTailFoldedLoop() const
Returns true if tail-folding is preferred over an epilogue.
bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF)
Returns true if an artificially high cost for emulated masked memrefs should be used.
void collectNonVectorizedAndSetWideningDecisions(ElementCount VF)
Collect values that will not be widened, including Uniforms, Scalars, and Instructions to Scalarize f...
bool isMaskRequired(Instruction *I) const
Wrapper function for LoopVectorizationLegality::isMaskRequired, that passes the Instruction I and if ...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationLegality * Legal
Vectorization legality.
uint64_t getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB)
A helper function that returns how much we should divide the cost of a predicated block by.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return true if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, InstructionCost Cost)
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
bool isEpilogueAllowed() const
Returns true if an epilogue is allowed (e.g., not prevented by optsize or a loop hint annotation).
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block require predication for any reason,...
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
@ CM_InvalidatedDecision
A widening decision that has been invalidated after replacing the corresponding recipe during VPlan t...
bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
LoopVectorizationCostModel(EpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, std::function< BlockFrequencyInfo &()> GetBFI, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI, VFSelectionContext &Config)
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF)
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool isScalarWithPredication(Instruction *I, ElementCount VF)
Returns true if I is an instruction which requires predication and for which our chosen predication s...
std::function< BlockFrequencyInfo &()> GetBFI
A function to lazily fetch BlockFrequencyInfo.
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost MaskedCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
TailFoldingStyle getTailFoldingStyle() const
Returns the TailFoldingStyle that is best for the current loop.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
MapVector< PHINode *, InductionDescriptor > InductionList
InductionList saves induction variables and maps them to the induction descriptor.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
bool hasUncountableEarlyExit() const
Returns true if the loop has uncountable early exits, i.e.
bool hasHistograms() const
Returns true if the loop contains any known histogram operations.
const LoopAccessInfo * getLAI() const
Planner drives the vectorization process after having passed Legality checks.
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, EpilogueVectorizationKind EpilogueVecKind=EpilogueVectorizationKind::None)
EpilogueVectorizationKind
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
@ MainLoop
Vectorizing the main loop of epilogue vectorization.
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition VPlan.cpp:1712
void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan, bool VectorizingEpilogue, MDNode *OrigLoopID, std::optional< unsigned > OrigAverageTripCount, unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll)
Update loop metadata and profile info for both the scalar remainder loop and VectorLoop,...
Definition VPlan.cpp:1763
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const
Attach the runtime checks of RTChecks to Plan.
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost)
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1698
void printPlans(raw_ostream &O)
Definition VPlan.cpp:1869
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
std::unique_ptr< VPlan > selectBestEpiloguePlan(VPlan &MainPlan, ElementCount MainLoopVF, unsigned IC)
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const
Create a check to Plan to see if the vector loop should be executed based on its trip count.
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
std::pair< VectorizationFactor, VPlan * > computeBestVF()
Compute and return the most profitable vectorization factor and the corresponding best VPlan.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
void emitRemarkWithHints() const
Dumps all the hint information.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition LoopInfo.cpp:73
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:126
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEVPredicate & getPredicate() const
LLVM_ABI unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
LLVM_ABI const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static LLVM_ABI bool isSubRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is for a sub operation.
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
LLVM_ABI Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
void eraseDeadInstructions(Value *Root)
Remove inserted instructions that are dead, e.g.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getURemExpr(SCEVUse LHS, SCEVUse RHS)
Represents an unsigned remainder expression based on unsigned division.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
LLVM_ABI void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
LLVM_ABI void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
iterator_range< op_iterator > op_range
Definition User.h:256
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
Holds state needed to make cost decisions before computing costs per-VF, including the maximum VFs.
const TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
std::optional< unsigned > getVScaleForTuning() const
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4263
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4290
iterator end()
Definition VPlan.h:4300
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4298
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4351
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override
Return the cost of this VPBasicBlock.
Definition VPlan.cpp:778
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
const VPRecipeBase & front() const
Definition VPlan.h:4310
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:661
bool empty() const
Definition VPlan.h:4309
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
void setName(const Twine &newName)
Definition VPlan.h:178
VPlan * getPlan()
Definition VPlan.cpp:211
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:226
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition VPlanUtils.h:267
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:295
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
T * insert(T *R)
Insert R at the current insertion point. Returns R unchanged.
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:559
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:532
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2395
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2442
void setBackedgeValue(VPValue *V)
Update the incoming value from the loop backedge.
Definition VPlan.h:2447
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2431
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2133
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4416
Class to record and manage LLVM IR flags.
Definition VPlan.h:696
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1227
iterator_range< operand_iterator > operandsWithoutMask()
Returns an iterator range over the operands excluding the mask operand if present.
Definition VPlan.h:1481
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1319
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1312
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1270
unsigned getOpcode() const
Definition VPlan.h:1410
void setName(StringRef NewName)
Set the symbolic name for the VPInstruction.
Definition VPlan.h:1509
VPValue * getMask() const
Returns the mask for the VPInstruction.
Definition VPlan.h:1475
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3057
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1639
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:401
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:554
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, VFRange &Range)
Create and return a widened recipe for a non-phi recipe R if one can be created within the given VF R...
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicateRecipe for VPI.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
bool isOrdered() const
Returns true, if the phi is part of an ordered reduction.
Definition VPlan.h:2854
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2833
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2857
VPReductionPHIRecipe * cloneWithOperands(VPValue *Start, VPValue *BackedgeValue)
Definition VPlan.h:2815
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2851
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3150
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4473
const VPBlockBase * getEntry() const
Definition VPlan.h:4517
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4601
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4585
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3304
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:610
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:681
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
operand_range operands()
Definition VPlanValue.h:455
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:428
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:423
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1511
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1517
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2240
A recipe to compute the pointers for widened memory accesses of SourceElementTy, with the Stride expr...
Definition VPlan.h:2314
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1848
A recipe for handling GEP instructions.
Definition VPlan.h:2175
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2554
A recipe for widened phis.
Definition VPlan.h:2685
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1790
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4621
bool hasVF(ElementCount VF) const
Definition VPlan.h:4844
ElementCount getSingleVF() const
Returns the single VF of the plan, asserting that the plan has exactly one VF.
Definition VPlan.h:4857
VPBasicBlock * getEntry()
Definition VPlan.h:4717
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4780
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4820
bool hasUF(unsigned UF) const
Definition VPlan.h:4869
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4770
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4894
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4920
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1098
bool hasEarlyExit() const
Returns true if the VPlan is based on a loop with an early exit.
Definition VPlan.h:5017
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition VPlan.cpp:1080
LLVM_ABI_FOR_TEST bool isOuterLoop() const
Returns true if this VPlan is for an outer loop, i.e., its vector loop region contains a nested loop ...
Definition VPlan.cpp:1113
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4794
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4746
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4722
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4817
bool hasScalarVFOnly() const
Definition VPlan.h:4862
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4760
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition VPlan.cpp:950
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4766
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4813
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
constexpr bool isNonZero() const
Definition TypeSize.h:155
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr bool isZero() const
Definition TypeSize.h:153
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:237
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
cst_pred_ty< is_specific_signed_cst > m_scev_SpecificSInt(int64_t V)
Match an SCEV constant with a plain signed integer (sign-extended value will be matched)
match_bind< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t, SCEV::FlagAnyWrap, true > m_scev_c_Mul(const Op0_t &Op0, const Op1_t &Op1)
bool matchFindIVResult(VPInstruction *VPI, Op0_t ReducedIV, Op1_t Start)
Match FindIV result pattern: select(icmp ne ComputeReductionResult(ReducedIV), Sentinel),...
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPBasicBlock * getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT)
Returns the header block of the first, top-level loop, or null if none exist.
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:116
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:137
GEPNoWrapFlags getGEPFlagsForPtr(VPValue *Ptr)
Returns the GEP nowrap flags for Ptr, looking through pointer casts mirroring Value::stripPointerCast...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:683
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
LLVM_ABI_FOR_TEST cl::opt< bool > VerifyEachVPlan
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
LLVM_ABI bool VerifySCEV
LLVM_ABI_FOR_TEST cl::opt< bool > VPlanPrintAfterAll
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
constexpr auto bind_front(FnT &&Fn, BindArgsT &&...BindArgs)
C++20 bind_front.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI)
Return true if the control flow in RPOTraversal is irreducible.
Definition CFG.h:154
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:78
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:83
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:88
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI cl::opt< bool > EnableLoopVectorization
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI_FOR_TEST cl::list< std::string > VPlanPrintAfterPasses
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:422
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1836
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
TargetTransformInfo TTI
@ CM_EpilogueNotAllowedLowTripLoop
@ CM_EpilogueNotNeededFoldTail
@ CM_EpilogueNotAllowedFoldTail
@ CM_EpilogueNotAllowedOptSize
@ CM_EpilogueAllowed
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
DWARFExpression::Operation Op
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
cl::opt< bool > EnableVPlanNativePath
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool pred_empty(const BasicBlock *BB)
Definition CFG.h:119
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataAndControlFlow
Use predicate to control both data and control flow.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:73
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI_FOR_TEST bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305
LLVM_ABI_FOR_TEST cl::opt< bool > VPlanPrintVectorRegionScope
LLVM_ABI cl::opt< bool > EnableLoopInterleaving
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition Analysis.h:29
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
TargetLibraryInfo * TLI
LLVM_ABI LoopVectorizeResult runImpl(Function &F)
LLVM_ABI bool processLoop(Loop *L)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LLVM_ABI LoopVectorizePass(LoopVectorizeOptions Opts={})
ScalarEvolution * SE
AssumptionCache * AC
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
std::function< BlockFrequencyInfo &()> GetBFI
TargetTransformInfo * TTI
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:89
A marker analysis to determine if extra passes should be run after loop vectorization.
static LLVM_ABI AnalysisKey Key
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
ElementCount End
Struct to hold various analysis needed for cost computations.
LoopVectorizationCostModel & CM
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
void invalidateWideningDecision(Instruction *I, ElementCount VF)
Mark the widening decision for I at VF as invalidated since a VPlan transform replaced the original r...
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
uint64_t getPredBlockCostDivisor(BasicBlock *BB) const
TargetTransformInfo::TargetCostKind CostKind
std::optional< CallWideningKind > getLegacyCallKind(CallInst *CI, ElementCount VF) const
Returns the legacy call widening decision for CI at VF, or std::nullopt if none was recorded.
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:1118
A struct that represents some properties of the register usage of a loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3672
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3770
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static bool createHeaderPhiRecipes(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop, const MapVector< PHINode *, InductionDescriptor > &Inductions, const MapVector< PHINode *, RecurrenceDescriptor > &Reductions, const SmallPtrSetImpl< const PHINode * > &FixedOrderRecurrences, const SmallPtrSetImpl< PHINode * > &InLoopReductions, bool AllowReordering)
Replace VPPhi recipes in Plan's header with corresponding VPHeaderPHIRecipe subclasses for inductions...
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void introduceMasksAndLinearize(VPlan &Plan)
Predicate and linearize the control-flow in the only loop region of Plan.
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void foldTailByMasking(VPlan &Plan)
Adapts the vector loop region for tail folding by introducing a header mask and conditionally executi...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addMinimumVectorEpilogueIterationCheck(VPlan &Plan, Value *VectorTripCount, bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE)
Add a check to Plan to see if the epilogue vector loop should be executed.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool handleMultiUseReductions(VPlan &Plan, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
Try to legalize reductions with multiple in-loop uses.
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static bool handleFindLastReductions(VPlan &Plan)
Check if Plan contains any FindLast reductions.
static void createInLoopReductionRecipes(VPlan &Plan, ElementCount MinVF)
Create VPReductionRecipes for in-loop reductions.
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static LLVM_ABI_FOR_TEST std::unique_ptr< VPlan > buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, PredicatedScalarEvolution &PSE, LoopVersioning *LVer=nullptr)
Create a base VPlan0, serving as the common starting point for all later candidates.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void addCanonicalIVRecipes(VPlan &Plan, DebugLoc DL)
Add a canonical IV and its increment to Plan, using InductionTy and DL.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decisions prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replace all uses of the canonical ...
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static LLVM_ABI_FOR_TEST bool handleEarlyExits(VPlan &Plan, UncountableExitStyle Style, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC)
Update Plan to account for all early exits.
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool handleMaxMinNumReductions(VPlan &Plan)
Check if Plan contains any FMaxNum or FMinNum reductions.
static LLVM_ABI_FOR_TEST void createLoopRegions(VPlan &Plan)
Replace loops in Plan's flat CFG with VPRegionBlocks, turning Plan's flat CFG into a hierarchical CFG...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace replicating VPReplicateRecipe, VPScalarIVStepsRecipe and VPInstruction in Plan with VF single...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static void addIterationCountCheckBlock(VPlan &Plan, ElementCount VF, unsigned UF, bool RequiresScalarEpilogue, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, PredicatedScalarEvolution &PSE)
Add a new check block before the vector preheader to Plan to check if the main vector loop should be ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, PredicatedScalarEvolution &PSE, VPBasicBlock *CheckBlock)
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replace the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...
static LLVM_ABI_FOR_TEST void addMiddleCheck(VPlan &Plan, bool TailFolded)
If a check is needed to guard executing the scalar epilogue loop, it will be added to the middle bloc...
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g. due to runtime checks.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization; cost 0 means uncomputed cost.
static LLVM_ABI bool HoistRuntimeChecks